1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
22 /* Get isblank from GNU libc. */
26 #include <sys/types.h>
30 #include "long-options.h"
33 #define join system_join
38 /* Undefine, to avoid warning about redefinition on some systems. */
41 #define min(A, B) ((A) < (B) ? (A) : (B))
42 #define max(A, B) ((A) > (B) ? (A) : (B))
44 /* An element of the list describing the format of each
48 int file; /* File to take field from (1 or 2). */
49 int field; /* Field number to print. */
53 /* A field of a line. */
56 const char *beg; /* First character in field. */
57 size_t len; /* The length of the field. */
60 /* A line read from an input file. Newlines are not stored. */
63 char *beg; /* First character in line. */
64 char *lim; /* Character after last character in line. */
65 int nfields; /* Number of elements in `fields'. */
66 int nfields_allocated; /* Number of elements allocated for `fields'. */
70 /* One or more consecutive lines read from a file that all have the
71 same join field value. */
74 int count; /* Elements used in `lines'. */
75 int alloc; /* Elements allocated in `lines'. */
79 /* The name this program was run with. */
82 /* If nonzero, print unpairable lines in file 1 or 2. */
83 static int print_unpairables_1, print_unpairables_2;
85 /* If nonzero, print pairable lines. */
86 static int print_pairables;
88 /* Empty output field filler. */
89 static char *empty_filler;
91 /* Field to join on. */
92 static int join_field_1, join_field_2;
94 /* List of fields to print. */
95 static struct outlist *outlist;
97 /* Last element in `outlist', where a new element can be added. */
98 static struct outlist *outlist_end;
100 /* Tab character separating fields; if this is NUL, fields are separated
101 by any nonempty string of white space, otherwise by exactly one
105 /* When using getopt_long_only, no long option can start with
106 a character that is a short option. */
107 static struct option const longopts[] =
109 {"j", required_argument, NULL, 'j'},
110 {"j1", required_argument, NULL, '1'},
111 {"j2", required_argument, NULL, '2'},
115 /* Used to print non-joining lines */
116 static struct line uni_blank;
122 fprintf (stderr, _("Try `%s --help' for more information.\n"),
127 Usage: %s [OPTION]... FILE1 FILE2\n\
131 For each pair of input lines with identical join fields, write a line to\n\
132 standard output. The default join field is the first, delimited\n\
133 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
135 -a SIDE print unpairable lines coming from file SIDE\n\
136 -e EMPTY replace missing input fields with EMPTY\n\
137 -j FIELD (Obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
138 -j1 FIELD (Obsolescent) equivalent to `-1 FIELD'\n\
139 -j2 FIELD (Obsolescent) equivalent to `-2 FIELD'\n\
140 -1 FIELD join on this FIELD of file 1\n\
141 -2 FIELD join on this FIELD of file 2\n\
142 -o FORMAT obey FORMAT while constructing output line\n\
143 -t CHAR use CHAR as input and output field separator\n\
144 -v SIDE like -a SIDE, but suppress joined output lines\n\
145 --help display this help and exit\n\
146 --version output version information and exit\n\
148 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
149 else fields are separated by CHAR. Any FIELD is a field number counted\n\
150 from 1. FORMAT is one or more comma or blank separated specifications,\n\
151 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
152 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
153 separated by CHAR.\n\
160 ADD_FIELD (struct line *line, const char *field, size_t len)
162 if (line->nfields >= line->nfields_allocated)
164 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
165 line->fields = (struct field *) xrealloc ((char *) line->fields,
166 (line->nfields_allocated
167 * sizeof (struct field)));
169 line->fields[line->nfields].beg = field;
170 line->fields[line->nfields].len = len;
174 /* Fill in the `fields' structure in LINE. */
177 xfields (struct line *line)
180 register char *ptr, *lim;
185 for (i = 0; ptr < lim; ++i)
192 while (ptr < lim && *ptr != tab)
194 ADD_FIELD (line, beg, ptr - beg);
203 while (ptr < lim && !ISSPACE (*ptr))
205 ADD_FIELD (line, beg, ptr - beg);
206 while (ptr < lim && ISSPACE (*ptr))
211 if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab))
213 /* Add one more (empty) field because the last character of the
214 line was a delimiter. */
215 ADD_FIELD (line, NULL, 0);
219 /* Read a line from FP into LINE and split it into fields.
220 Return 0 if EOF, 1 otherwise. */
223 get_line (FILE *fp, struct line *line)
225 static int linesize = 80;
232 ptr = xmalloc (linesize);
234 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
239 ptr = xrealloc (ptr, linesize);
244 if (c == EOF && i == 0)
251 line->lim = line->beg + i;
252 line->nfields_allocated = 0;
260 freeline (struct line *line)
262 free ((char *) line->fields);
268 initseq (struct seq *seq)
272 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
275 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
278 getseq (FILE *fp, struct seq *seq)
280 if (seq->count == seq->alloc)
283 seq->lines = (struct line *)
284 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
287 if (get_line (fp, &seq->lines[seq->count]))
296 delseq (struct seq *seq)
299 for (i = 0; i < seq->count; i++)
300 if (seq->lines[i].beg)
301 freeline (&seq->lines[i]);
302 free ((char *) seq->lines);
305 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
306 >0 if it compares greater; 0 if it compares equal. */
309 keycmp (struct line *line1, struct line *line2)
311 const char *beg1, *beg2; /* Start of field to compare in each file. */
312 int len1, len2; /* Length of fields to compare. */
315 if (join_field_1 < line1->nfields)
317 beg1 = line1->fields[join_field_1].beg;
318 len1 = line1->fields[join_field_1].len;
326 if (join_field_2 < line2->nfields)
328 beg2 = line2->fields[join_field_2].beg;
329 len2 = line2->fields[join_field_2].len;
338 return len2 == 0 ? 0 : -1;
341 diff = memcmp (beg1, beg2, min (len1, len2));
347 /* Print field N of LINE if it exists and is nonempty, otherwise
348 `empty_filler' if it is nonempty. */
351 prfield (int n, struct line *line)
355 if (n < line->nfields)
357 len = line->fields[n].len;
359 fwrite (line->fields[n].beg, 1, len, stdout);
360 else if (empty_filler)
361 fputs (empty_filler, stdout);
363 else if (empty_filler)
364 fputs (empty_filler, stdout);
367 /* Print the join of LINE1 and LINE2. */
370 prjoin (struct line *line1, struct line *line2)
376 prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
377 for (o = outlist->next; o; o = o->next)
379 putchar (tab ? tab : ' ');
380 prfield (o->field - 1, o->file == 1 ? line1 : line2);
388 if (line1 == &uni_blank)
395 prfield (join_field_1, line1);
396 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
398 putchar (tab ? tab : ' ');
401 for (i = join_field_1 + 1; i < line1->nfields; ++i)
403 putchar (tab ? tab : ' ');
407 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
409 putchar (tab ? tab : ' ');
412 for (i = join_field_2 + 1; i < line2->nfields; ++i)
414 putchar (tab ? tab : ' ');
421 /* Print the join of the files in FP1 and FP2. */
428 struct seq seq1, seq2;
430 int diff, i, j, eof1, eof2;
432 /* Read the first line of each file. */
438 while (seq1.count && seq2.count)
440 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
443 if (print_unpairables_1)
444 prjoin (&seq1.lines[0], &uni_blank);
445 freeline (&seq1.lines[0]);
452 if (print_unpairables_2)
453 prjoin (&uni_blank, &seq2.lines[0]);
454 freeline (&seq2.lines[0]);
460 /* Keep reading lines from file1 as long as they continue to
461 match the current line from file2. */
464 if (!getseq (fp1, &seq1))
470 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
472 /* Keep reading lines from file2 as long as they continue to
473 match the current line from file1. */
476 if (!getseq (fp2, &seq2))
482 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
486 for (i = 0; i < seq1.count - 1; ++i)
487 for (j = 0; j < seq2.count - 1; ++j)
488 prjoin (&seq1.lines[i], &seq2.lines[j]);
491 for (i = 0; i < seq1.count - 1; ++i)
492 freeline (&seq1.lines[i]);
495 seq1.lines[0] = seq1.lines[seq1.count - 1];
501 for (i = 0; i < seq2.count - 1; ++i)
502 freeline (&seq2.lines[i]);
505 seq2.lines[0] = seq2.lines[seq2.count - 1];
512 if (print_unpairables_1 && seq1.count)
514 prjoin (&seq1.lines[0], &uni_blank);
515 freeline (&seq1.lines[0]);
516 while (get_line (fp1, &line))
518 prjoin (&line, &uni_blank);
523 if (print_unpairables_2 && seq2.count)
525 prjoin (&uni_blank, &seq2.lines[0]);
526 freeline (&seq2.lines[0]);
527 while (get_line (fp2, &line))
529 prjoin (&uni_blank, &line);
538 /* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
539 unless either argument is invalid; then just return 0. */
542 add_field (int file, int field)
546 if (file < 1 || file > 2 || field < 1)
548 o = (struct outlist *) xmalloc (sizeof (struct outlist));
553 /* Add to the end of the list so the fields are in the right order. */
557 outlist_end->next = o;
563 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
564 Return the number of fields added. */
567 add_field_list (char *str)
570 int file = -1, field = -1;
575 if (*str == ',' || ISBLANK (*str))
577 added += add_field (file, field);
578 uni_blank.nfields = max (uni_blank.nfields, field);
582 else if (*str == '.')
584 else if (ISDIGIT (*str))
590 file = file * 10 + *str - '0';
596 field = field * 10 + *str - '0';
603 uni_blank.nfields = max (uni_blank.nfields, field);
604 added += add_field (file, field);
608 /* Create a blank line with COUNT fields separated by tabs. */
611 make_blank (struct line *blank, int count)
614 blank->nfields = count;
615 blank->beg = xmalloc (blank->nfields + 1);
616 blank->fields = (struct field *) xmalloc (sizeof (struct field) * count);
617 for (i = 0; i < blank->nfields; i++)
619 blank->beg[i] = '\t';
620 blank->fields[i].beg = &blank->beg[i];
621 blank->fields[i].len = 0;
623 blank->beg[i] = '\0';
624 blank->lim = &blank->beg[i];
628 main (int argc, char **argv)
632 int optc, prev_optc = 0, nfiles, val;
634 program_name = argv[0];
636 /* Initialize this before parsing options. In parsing options,
637 it may be increased. */
638 uni_blank.nfields = 1;
640 parse_long_options (argc, argv, "join", version_string, usage);
645 while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts,
656 print_unpairables_1 = 1;
658 print_unpairables_2 = 1;
660 error (2, 0, _("invalid file number for `-a'"));
664 empty_filler = optarg;
670 error (2, 0, _("invalid field number for `-1'"));
671 join_field_1 = val - 1;
677 error (2, 0, _("invalid field number for `-2'"));
678 join_field_2 = val - 1;
684 error (2, 0, _("invalid field number for `-j'"));
685 join_field_1 = join_field_2 = val - 1;
689 if (add_field_list (optarg) == 0)
690 error (2, 0, _("invalid field list for `-o'"));
700 print_unpairables_1 = 1;
702 print_unpairables_2 = 1;
704 error (2, 0, _("invalid file number for `-v'"));
708 case 1: /* Non-option argument. */
709 if (prev_optc == 'o' && optind <= argc - 2)
711 /* Might be continuation of args to -o. */
712 if (add_field_list (optarg) > 0)
713 continue; /* Don't change `prev_optc'. */
718 error (0, 0, "too many non-option arguments");
721 names[nfiles++] = optarg;
730 /* Now that we've seen the options, we can construct the blank line
732 make_blank (&uni_blank, uni_blank.nfields);
736 error (0, 0, "too few non-option arguments");
740 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
742 error (1, errno, "%s", names[0]);
743 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
745 error (1, errno, "%s", names[1]);
747 error (1, errno, _("both files cannot be standard input"));
750 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
751 error (1, errno, "-");
752 if (ferror (stdout) || fclose (stdout) == EOF)
753 error (1, errno, _("write error"));