1 /* join - join lines of two files on a common field
2 Copyright (C) 1991 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
21 #if defined (CONFIG_BROKETS)
22 /* We use <config.h> instead of "config.h" so that a compilation
23 using -I. -I$srcdir will use ./config.h rather than $srcdir/config.h
24 (which it would do because it found this file in $srcdir). */
31 /* Get isblank from GNU libc. */
35 #include <sys/types.h>
39 #include "long-options.h"
/* Smaller and larger of two values.  Each argument appears twice in
   the expansion, so an argument with side effects (for example `i++')
   may be evaluated twice. */
46 #define min(A, B) ((A) < (B) ? (A) : (B))
47 #define max(A, B) ((A) > (B) ? (A) : (B))
49 /* An element of the list describing the format of each
53 int file; /* File to take field from (1 or 2). */
54 int field; /* Field number to print. */
58 /* A field of a line. */
61 char *beg; /* First character in field. */
62 char *lim; /* Character after last character in field. */
65 /* A line read from an input file. Newlines are not stored. */
68 char *beg; /* First character in line. */
69 char *lim; /* Character after last character in line. */
70 int nfields; /* Number of elements in `fields'. */
74 /* One or more consecutive lines read from a file that all have the
75 same join field value. */
78 int count; /* Elements used in `lines'. */
79 int alloc; /* Elements allocated in `lines'. */
83 /* The name this program was run with. */
86 /* If nonzero, print unpairable lines in file 1 or 2. */
87 static int print_unpairables_1, print_unpairables_2;
89 /* If nonzero, print pairable lines. */
90 static int print_pairables;
92 /* Empty output field filler. */
93 static char *empty_filler;
95 /* Field to join on. */
96 static int join_field_1, join_field_2;
98 /* List of fields to print. */
99 static struct outlist *outlist;
101 /* Last element in `outlist', where a new element can be added. */
102 static struct outlist *outlist_end;
104 /* Tab character separating fields; if this is NUL fields are separated
105 by any nonempty string of white space, otherwise by exactly one
109 /* When using getopt_long_only, no long option can start with
110 a character that is a short option. */
111 static struct option const longopts[] =
113 {"j", required_argument, NULL, 'j'},
114 {"j1", required_argument, NULL, '1'},
115 {"j2", required_argument, NULL, '2'},
119 /* Used to print non-joining lines */
120 static struct line blank1;
121 static struct line blank2;
123 /* Fill in the `fields' structure in LINE. */
/* Element count used when sizing the field array.  It is static, so
   its value persists from one call to the next. */
129 static int nfields = 2;
131 register char *ptr, *lim;
133 line->fields = (struct field *) xmalloc (nfields * sizeof (struct field));
138 for (i = 0; ptr < lim; ++i)
/* Resize the array to hold `nfields' elements.  The test that triggers
   this and the step that changes `nfields' are not visible in this
   excerpt. */
143 line->fields = (struct field *)
144 xrealloc ((char *) line->fields, nfields * sizeof (struct field));
/* Here a field runs from PTR up to the next `tab' character or the end
   of the line, whichever comes first. */
148 line->fields[i].beg = ptr;
149 while (ptr < lim && *ptr != tab)
151 line->fields[i].lim = ptr;
/* Here a field runs from PTR up to the next ISSPACE character or the
   end of the line; the final loop then scans the run of ISSPACE
   characters that follows the field. */
157 line->fields[i].beg = ptr;
158 while (ptr < lim && !ISSPACE (*ptr))
160 line->fields[i].lim = ptr;
161 while (ptr < lim && ISSPACE (*ptr))
169 /* Read a line from FP into LINE and split it into fields.
170 Return 0 if EOF, 1 otherwise. */
/* Size in bytes of the line buffer allocated below.  It is static, so
   its value persists across calls. */
177 static int linesize = 80;
184 ptr = xmalloc (linesize);
/* Read one character at a time until end of file or a newline.  I
   counts the characters read before that point, so the newline itself
   is not counted. */
186 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
/* Resize the buffer to `linesize' bytes; the test that triggers this
   and the change to `linesize' are not visible in this excerpt. */
191 ptr = xrealloc (ptr, linesize);
/* End of file reached before any character was read: there is no line
   to deliver (presumably the "return 0" case described above). */
196 if (c == EOF && i == 0)
/* The line ends I characters after its start. */
203 line->lim = line->beg + i;
212 free ((char *) line->fields);
222 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
225 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
/* Every allocated slot is in use, so the array must be resized before
   another line can be stored.  The statement that changes `seq->alloc'
   is not visible in this excerpt. */
232 if (seq->count == seq->alloc)
235 seq->lines = (struct line *)
236 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
/* Read directly into the first unused slot, lines[count]. */
239 if (get_line (fp, &seq->lines[seq->count]))
251 free ((char *) seq->lines);
254 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
255 >0 if it compares greater; 0 if it compares equal. */
258 keycmp (line1, line2)
262 char *beg1, *beg2; /* Start of field to compare in each file. */
263 int len1, len2; /* Length of fields to compare. */
/* `join_field_1' is a zero-based index, so LINE1 has a join field only
   when that index is below the line's field count. */
266 if (join_field_1 < line1->nfields)
268 beg1 = line1->fields[join_field_1].beg;
269 len1 = line1->fields[join_field_1].lim
270 - line1->fields[join_field_1].beg;
/* Same test for LINE2, using `join_field_2'. */
278 if (join_field_2 < line2->nfields)
280 beg2 = line2->fields[join_field_2].beg;
281 len2 = line2->fields[join_field_2].lim
282 - line2->fields[join_field_2].beg;
/* Yields 0 when the second key is empty and -1 otherwise.  The
   condition guarding this return is not visible here; presumably it
   is the case where the first key is empty -- confirm. */
291 return len2 == 0 ? 0 : -1;
/* Byte-wise comparison over the length of the shorter key only.  How
   a tie on this common prefix is resolved is not visible here. */
294 diff = memcmp (beg1, beg2, min (len1, len2));
300 /* Print field N (counted from 0) of LINE if it exists and is nonempty,
301 otherwise print `empty_filler' if one has been set (the pointer is
    non-null; the string itself is not tested for emptiness). */
/* Field N exists only when N is below the line's field count. */
310 if (n < line->nfields)
312 len = line->fields[n].lim - line->fields[n].beg;
/* The field is written as raw bytes; the return value of fwrite is
   not checked here. */
314 fwrite (line->fields[n].beg, 1, len, stdout);
/* The field exists but was not printed above. */
315 else if (empty_filler)
316 fputs (empty_filler, stdout);
/* The line has no field N at all. */
318 else if (empty_filler)
319 fputs (empty_filler, stdout);
322 /* Print LINE, with its fields separated by `tab'. */
/* Visit every field of the line in order. */
330 for (i = 0; i < line->nfields; ++i)
/* True for the last field of the line; what is done in that case is
   not visible in this excerpt. */
333 if (i == line->nfields - 1)
/* When `tab' is NUL, a single space is written as the separator. */
336 putchar (tab ? tab : ' ');
340 /* Print the join of LINE1 and LINE2. */
343 prjoin (line1, line2)
/* Explicit output list: each element names a file (1 selects LINE1,
   anything else LINE2) and a field number counted from 1, converted
   here to the zero-based index that prfield takes.  The first element
   is printed bare; every later one is preceded by a separator. */
351 prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
352 for (o = outlist->next; o; o = o->next)
354 putchar (tab ? tab : ' ');
355 prfield (o->field - 1, o->file == 1 ? line1 : line2);
/* Default format: the join field first, then the other fields of
   LINE1, then the other fields of LINE2.
   NOTE(review): the join field is always taken from LINE1, and the
   loops over LINE2 below skip index `join_field_2'.  When LINE1 is a
   blank placeholder (as passed for unpairable file-2 lines), the key
   of LINE2 therefore appears not to be printed -- confirm against the
   full file. */
363 prfield (join_field_1, line1);
/* Fields of LINE1 that come before its join field. */
364 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
366 putchar (tab ? tab : ' ');
/* Fields of LINE1 that come after its join field. */
369 for (i = join_field_1 + 1; i < line1->nfields; ++i)
371 putchar (tab ? tab : ' ');
/* Fields of LINE2 that come before its join field. */
375 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
377 putchar (tab ? tab : ' ');
/* Fields of LINE2 that come after its join field. */
380 for (i = join_field_2 + 1; i < line2->nfields; ++i)
382 putchar (tab ? tab : ' ');
389 /* Print the join of the files in FP1 and FP2. */
396 struct seq seq1, seq2;
398 int diff, i, j, eof1, eof2;
400 /* Read the first line of each file. */
/* Continue while each sequence still holds at least one line. */
406 while (seq1.count && seq2.count)
/* Compare the join fields of the first line held for each file. */
408 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
/* File-1 line handled as unpairable: print it against the blank
   file-2 line if such output was requested, then release it.  The
   test on `diff' that selects this path is not visible here. */
411 if (print_unpairables_1)
412 prjoin (&seq1.lines[0], &blank2);
413 freeline (&seq1.lines[0]);
/* The mirror case for a file-2 line, printed against the blank
   file-1 line. */
420 if (print_unpairables_2)
421 prjoin (&blank1, &seq2.lines[0]);
422 freeline (&seq2.lines[0]);
428 /* Keep reading lines from file1 as long as they continue to
429 match the current line from file2. */
432 if (!getseq (fp1, &seq1))
438 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
440 /* Keep reading lines from file2 as long as they continue to
441 match the current line from file1. */
444 if (!getseq (fp2, &seq2))
450 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
/* Print every pairing of the collected lines.  Both loops stop one
   short of `count', so the last line of each sequence is excluded;
   presumably that is the first non-matching line read by the loops
   above.  How end of file is accounted for is not visible here. */
454 for (i = 0; i < seq1.count - 1; ++i)
455 for (j = 0; j < seq2.count - 1; ++j)
456 prjoin (&seq1.lines[i], &seq2.lines[j]);
/* Release the file-1 lines just handled and move the last line of the
   sequence into slot 0. */
459 for (i = 0; i < seq1.count - 1; ++i)
460 freeline (&seq1.lines[i]);
463 seq1.lines[0] = seq1.lines[seq1.count - 1];
/* Same for the file-2 sequence. */
469 for (i = 0; i < seq2.count - 1; ++i)
470 freeline (&seq2.lines[i]);
473 seq2.lines[0] = seq2.lines[seq2.count - 1];
/* After the main loop: if unpairable file-1 lines were requested and
   one is still held, print it, then print every remaining line of
   file 1 against the blank file-2 line. */
480 if (print_unpairables_1 && seq1.count)
482 prjoin(&seq1.lines[0], &blank2);
483 freeline (&seq1.lines[0]);
484 while (get_line (fp1, &line))
486 prjoin(&line, &blank2);
/* The same draining step for file 2. */
491 if (print_unpairables_2 && seq2.count)
493 prjoin(&blank1, &seq2.lines[0]);
494 freeline (&seq2.lines[0]);
495 while (get_line (fp2, &line))
497 prjoin(&blank1, &line);
506 /* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
507 unless either argument is invalid; then just return 0. */
510 add_field (file, field)
/* Invalid unless FILE is 1 or 2 and FIELD is at least 1. */
516 if (file < 1 || file > 2 || field < 1)
/* Allocate the new list element. */
518 o = (struct outlist *) xmalloc (sizeof (struct outlist));
523 /* Add to the end of the list so the fields are in the right order. */
527 outlist_end->next = o;
533 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
534 Return the number of fields added. */
/* Both start at -1, a value add_field rejects, so a spec with a
   missing part is not counted as added. */
541 int file = -1, field = -1;
/* A comma or blank ends the current spec. */
546 if (*str == ',' || ISBLANK (*str))
548 added += add_field (file, field);
/* Record the highest field number seen for each file in the matching
   blank line's `nfields'. */
550 case 1: blank1.nfields = max(blank1.nfields, field); break;
551 case 2: blank2.nfields = max(blank2.nfields, field); break;
/* A period divides the file number from the field number. */
556 else if (*str == '.')
558 else if (ISDIGIT (*str))
/* Accumulate a decimal number one digit at a time.
   NOTE(review): no range check is visible, so a very long digit
   string could overflow `int' -- confirm against the full file. */
564 file = file * 10 + *str - '0';
570 field = field * 10 + *str - '0';
/* Add the spec that was in progress when the string ended. */
577 added += add_field (file, field);
581 /* Create a blank line with COUNT fields separated by tabs. */
584 make_blank (blank, count)
/* The text buffer is sized from `blank->nfields' (one byte per field
   plus a terminator) while the field array is sized from COUNT.  The
   visible callers pass `blank->nfields' as COUNT, so the two agree
   there; any assignment of COUNT to `nfields' is not visible here. */
589 blank->beg = xmalloc(blank->nfields + 1);
590 blank->fields = (struct field *)xmalloc(sizeof(struct field) * count);
/* Store one tab per field and make every field empty: its start and
   limit both point at that tab. */
591 for (i = 0; i < blank->nfields; i++) {
592 blank->beg[i] = '\t';
593 blank->fields[i].lim = blank->fields[i].beg = &blank->beg[i];
/* Terminate the buffer; the line's limit is the terminator. */
595 blank->beg[i] = '\0';
596 blank->lim = &blank->beg[i];
/* Body of the program entry point (its header is outside this
   excerpt): parse the options, open the two inputs, and check the
   standard streams for errors before exiting. */
606 int optc, prev_optc = 0, nfiles, val;
611 program_name = argv[0];
613 parse_long_options (argc, argv, usage);
/* NOTE(review): in this excerpt the two make_blank calls below come
   before the getopt_long_only loop, yet `blank1.nfields' and
   `blank2.nfields' are raised only by add_field_list, which that loop
   calls for `-o'.  If this ordering is real, the blank lines are built
   with the field counts from before option parsing -- confirm against
   the full file. */
615 /* Now that we've seen the options, we can construct the blank line
617 make_blank(&blank1, blank1.nfields);
618 make_blank(&blank2, blank2.nfields);
623 while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts,
634 print_unpairables_1 = 1;
636 print_unpairables_2 = 1;
638 error (2, 0, "invalid file number for `-a'");
642 empty_filler = optarg;
648 error (2, 0, "invalid field number for `-1'");
649 join_field_1 = val - 1;
655 error (2, 0, "invalid field number for `-2'");
656 join_field_2 = val - 1;
662 error (2, 0, "invalid field number for `-j'");
663 join_field_1 = join_field_2 = val - 1;
667 if (add_field_list (optarg) == 0)
668 error (2, 0, "invalid field list for `-o'");
678 print_unpairables_1 = 1;
680 print_unpairables_2 = 1;
682 error (2, 0, "invalid file number for `-v'");
686 case 1: /* Non-option argument. */
687 if (prev_optc == 'o')
689 /* Might be continuation of args to -o. */
690 if (add_field_list (optarg) > 0)
691 continue; /* Don't change `prev_optc'. */
/* Otherwise the argument is recorded as an input file name. */
696 names[nfiles++] = optarg;
/* A file name of "-" selects standard input; any other name is opened
   for reading. */
708 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
710 error (1, errno, "%s", names[0]);
711 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
713 error (1, errno, "%s", names[1]);
/* NOTE(review): this message does not describe a failed library call,
   yet `errno' is passed as the error number, so error may append the
   text for whatever value `errno' happens to hold.  Passing 0, as the
   option-validation calls above do, looks like what is intended; the
   guarding condition is not visible here. */
715 error (1, errno, "both files cannot be standard input");
/* Close standard input if it served as one of the inputs, then check
   standard output for an earlier write error or a failure on close. */
718 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
719 error (1, errno, "-");
720 if (ferror (stdout) || fclose (stdout) == EOF)
721 error (1, errno, "write error");
731 fprintf (stderr, "Try `%s --help' for more information.\n",
736 Usage: %s [OPTION]... FILE1 FILE2\n\
741 -a SIDE print unpairable lines coming from file SIDE\n\
742 -e EMPTY replace missing input fields with EMPTY\n\
743 -j FIELD join on this FIELD for both files\n\
744 -[j]SIDE FIELD join on this FIELD for file SIDE\n\
745 -o FORMAT obey FORMAT while constructing output line\n\
746 -t CHAR use CHAR as input and output field separator\n\
747 -v SIDE like -a SIDE, but suppress joined output lines\n\
748 --help display this help and exit\n\
749 --version output version information and exit\n\
751 When FILE1 or FILE2 is -, not both, read standard input. SIDE is 1\n\
752 for FILE1 or 2 for FILE2. Unless -t CHAR is given, leading blanks\n\
753 separate fields and are ignored, else fields are separated by CHAR.\n\
754 Any FIELD is a field number counted from 1. FORMAT is one or more\n\
755 comma or blank separated specifications, each being `SIDE.FIELD'.\n\
756 Default FORMAT outputs the join field, the remaining fields from\n\
757 FILE1, the remaining fields from FILE2, all separated by CHAR.\n\