1 /* join - join lines of two files on a common field
2 Copyright (C) 1991 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #define isblank(c) ((c) == ' ' || (c) == '\t')
26 #include <sys/types.h>
31 #define ISSPACE(c) (isascii(c) && isspace(c))
32 #define ISDIGIT(c) (isascii(c) && isdigit(c))
34 #define ISSPACE(c) isspace(c)
35 #define ISDIGIT(c) isdigit(c)
43 #define min(A, B) ((A) < (B) ? (A) : (B))
45 /* An element of the list describing the format of each
46 output line. */
49 int file; /* File to take field from (1 or 2). */
50 int field; /* Field number to print. */
54 /* A field of a line. */
57 char *beg; /* First character in field. */
58 char *lim; /* Character after last character in field. */
61 /* A line read from an input file. Newlines are not stored. */
64 char *beg; /* First character in line. */
65 char *lim; /* Character after last character in line. */
66 int nfields; /* Number of elements in `fields'. */
70 /* One or more consecutive lines read from a file that all have the
71 same join field value. */
74 int count; /* Elements used in `lines'. */
75 int alloc; /* Elements allocated in `lines'. */
79 /* If nonzero, print unpairable lines in file 1 or 2. */
80 static int print_unpairables_1, print_unpairables_2;
82 /* If nonzero, print pairable lines. */
83 static int print_pairables;
85 /* Empty output field filler. */
86 static char *empty_filler;
88 /* Field to join on. */
89 static int join_field_1, join_field_2;
91 /* List of fields to print. */
92 static struct outlist *outlist;
94 /* Last element in `outlist', where a new element can be added. */
95 static struct outlist *outlist_end;
97 /* Tab character separating fields; if this is NUL, fields are separated
98 by any nonempty string of white space, otherwise by exactly one
99 tab character. */
102 /* The name this program was run with. */
105 /* Fill in the `fields' structure in LINE. */
111 static int nfields = 2;
113 register char *ptr, *lim;
115 line->fields = (struct field *) xmalloc (nfields * sizeof (struct field));
120 for (i = 0; ptr < lim; ++i)
125 line->fields = (struct field *)
126 xrealloc ((char *) line->fields, nfields * sizeof (struct field));
130 line->fields[i].beg = ptr;
131 while (ptr < lim && *ptr != tab)
133 line->fields[i].lim = ptr;
139 line->fields[i].beg = ptr;
140 while (ptr < lim && !ISSPACE (*ptr))
142 line->fields[i].lim = ptr;
143 while (ptr < lim && ISSPACE (*ptr))
151 /* Read a line from FP into LINE and split it into fields.
152 Return 0 if EOF, 1 otherwise. */
159 static int linesize = 80;
166 ptr = xmalloc (linesize);
168 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
173 ptr = xrealloc (ptr, linesize);
178 if (c == EOF && i == 0)
185 line->lim = line->beg + i;
194 free ((char *) line->fields);
204 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
207 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
214 if (seq->count == seq->alloc)
217 seq->lines = (struct line *)
218 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
221 if (get_line (fp, &seq->lines[seq->count]))
233 free ((char *) seq->lines);
236 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
237 >0 if it compares greater; 0 if it compares equal. */
240 keycmp (line1, line2)
244 char *beg1, *beg2; /* Start of field to compare in each file. */
245 int len1, len2; /* Length of fields to compare. */
248 if (join_field_1 < line1->nfields)
250 beg1 = line1->fields[join_field_1].beg;
251 len1 = line1->fields[join_field_1].lim
252 - line1->fields[join_field_1].beg;
260 if (join_field_2 < line2->nfields)
262 beg2 = line2->fields[join_field_2].beg;
263 len2 = line2->fields[join_field_2].lim
264 - line2->fields[join_field_2].beg;
273 return len2 == 0 ? 0 : -1;
276 diff = memcmp (beg1, beg2, min (len1, len2));
282 /* Print field N of LINE if it exists and is nonempty, otherwise
283 `empty_filler' if it is nonempty. */
292 if (n < line->nfields)
294 len = line->fields[n].lim - line->fields[n].beg;
296 fwrite (line->fields[n].beg, 1, len, stdout);
297 else if (empty_filler)
298 fputs (empty_filler, stdout);
300 else if (empty_filler)
301 fputs (empty_filler, stdout);
304 /* Print LINE, with its fields separated by `tab'. */
312 for (i = 0; i < line->nfields; ++i)
315 if (i == line->nfields - 1)
318 putchar (tab ? tab : ' ');
322 /* Print the join of LINE1 and LINE2. */
325 prjoin (line1, line2)
333 prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
334 for (o = outlist->next; o; o = o->next)
336 putchar (tab ? tab : ' ');
337 prfield (o->field - 1, o->file == 1 ? line1 : line2);
345 prfield (join_field_1, line1);
346 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
348 putchar (tab ? tab : ' ');
351 for (i = join_field_1 + 1; i < line1->nfields; ++i)
353 putchar (tab ? tab : ' ');
357 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
359 putchar (tab ? tab : ' ');
362 for (i = join_field_2 + 1; i < line2->nfields; ++i)
364 putchar (tab ? tab : ' ');
371 /* Print the join of the files in FP1 and FP2. */
378 struct seq seq1, seq2;
380 int diff, i, j, eof1, eof2;
382 /* Read the first line of each file. */
388 while (seq1.count && seq2.count)
390 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
393 if (print_unpairables_1)
394 prline (&seq1.lines[0]);
395 freeline (&seq1.lines[0]);
402 if (print_unpairables_2)
403 prline (&seq2.lines[0]);
404 freeline (&seq2.lines[0]);
410 /* Keep reading lines from file1 as long as they continue to
411 match the current line from file2. */
414 if (!getseq (fp1, &seq1))
420 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
422 /* Keep reading lines from file2 as long as they continue to
423 match the current line from file1. */
426 if (!getseq (fp2, &seq2))
432 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
436 for (i = 0; i < seq1.count - 1; ++i)
437 for (j = 0; j < seq2.count - 1; ++j)
438 prjoin (&seq1.lines[i], &seq2.lines[j]);
441 for (i = 0; i < seq1.count - 1; ++i)
442 freeline (&seq1.lines[i]);
445 seq1.lines[0] = seq1.lines[seq1.count - 1];
451 for (i = 0; i < seq2.count - 1; ++i)
452 freeline (&seq2.lines[i]);
455 seq2.lines[0] = seq2.lines[seq2.count - 1];
462 if (print_unpairables_1 && seq1.count)
464 prline (&seq1.lines[0]);
465 freeline (&seq1.lines[0]);
466 while (get_line (fp1, &line))
473 if (print_unpairables_2 && seq2.count)
475 prline (&seq2.lines[0]);
476 freeline (&seq2.lines[0]);
477 while (get_line (fp2, &line))
488 /* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
489 unless either argument is invalid; then just return 0. */
492 add_field (file, field)
498 if (file < 1 || file > 2 || field < 1)
500 o = (struct outlist *) xmalloc (sizeof (struct outlist));
505 /* Add to the end of the list so the fields are in the right order. */
509 outlist_end->next = o;
515 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
516 Return the number of fields added. */
523 int file = -1, field = -1;
528 if (*str == ',' || isblank (*str))
530 added += add_field (file, field);
534 else if (*str == '.')
536 else if (ISDIGIT (*str))
542 file = file * 10 + *str - '0';
548 field = field * 10 + *str - '0';
555 added += add_field (file, field);
559 /* When using getopt_long_only, no long option can start with
560 a character that is a short option. */
561 static struct option longopts[] =
564 {"j1", 1, NULL, '1'},
565 {"j2", 1, NULL, '2'},
576 int optc, prev_optc = 0, nfiles, val;
578 program_name = argv[0];
582 while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts,
590 print_unpairables_1 = 1;
592 print_unpairables_2 = 1;
594 error (2, 0, "invalid file number for `-a'");
598 empty_filler = optarg;
604 error (2, 0, "invalid field number for `-1'");
605 join_field_1 = val - 1;
611 error (2, 0, "invalid field number for `-2'");
612 join_field_2 = val - 1;
618 error (2, 0, "invalid field number for `-j'");
619 join_field_1 = join_field_2 = val - 1;
623 if (add_field_list (optarg) == 0)
624 error (2, 0, "invalid field list for `-o'");
634 print_unpairables_1 = 1;
636 print_unpairables_2 = 1;
638 error (2, 0, "invalid file number for `-v'");
642 case 1: /* Non-option argument. */
643 if (prev_optc == 'o')
645 /* Might be continuation of args to -o. */
646 if (add_field_list (optarg) > 0)
647 continue; /* Don't change `prev_optc'. */
652 names[nfiles++] = optarg;
664 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
666 error (1, errno, "%s", names[0]);
667 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
669 error (1, errno, "%s", names[1]);
671 error (1, errno, "both files cannot be standard input");
674 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
675 error (1, errno, "-");
676 if (ferror (stdout) || fclose (stdout) == EOF)
677 error (1, 0, "write error");
686 Usage: %s [-a 1|2] [-v 1|2] [-e empty-string] [-o field-list...] [-t char]\n\
687 [-j[1|2] field] [-1 field] [-2 field] file1 file2\n",