src/split.c

   1 /* split.c -- split a file into pieces.
   2    Copyright (C) 1988, 1991 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
  17 \f
  18 /* By tege@sics.se, with rms.
  19
  20    To do:
  21    * Implement -t CHAR or -t REGEX to specify break characters other
  22      than newline. */
  23
#include <stdio.h>
#include <getopt.h>
#include <ctype.h>
#include <limits.h>
#include <sys/types.h>
#include "system.h"
  29
  30 char *xmalloc ();
  31 void error ();
  32
  33 int convint ();
  34 int isdigits ();
  35 int stdread ();
  36 void line_bytes_split ();
  37 void bytes_split ();
  38 void cwrite ();
  39 void lines_split ();
  40 void next_file_name ();
  41
/* Name under which this program was invoked.  main sets it from argv[0];
   usage prints it in front of its messages.  */
char *program_name;

/* Buffer holding the name of the current output file: the output prefix
   followed by the generated suffix.  main allocates it, and cwrite passes
   it to open.  */
char *outfile;

/* Pointer to the end of the prefix in OUTFILE.
   Suffixes are inserted here.  next_file_name moves it forward by one
   each time it lengthens the name.  */
char *outfile_mid;

/* Pointer just past the last suffix character in OUTFILE.
   next_file_name starts incrementing at outfile_end[-1].  */
char *outfile_end;

/* Status for outfile name generation.
   outfile_count is the index of the current name within the current
   generation; it starts at -1 (the largest unsigned value) so that the
   first increment in next_file_name wraps it to 0.
   outfile_name_limit is the number of names in the current generation;
   next_file_name multiplies it by 26 when it starts a new generation.
   outfile_name_generation counts generations, starting at 1.  */
unsigned outfile_count = -1;
unsigned outfile_name_limit = 25 * 26;
unsigned outfile_name_generation = 1;

/* Name of input file.  May be "-".  */
char *infile;

/* Descriptor on which input file is open.  main uses 0 when INFILE
   is "-".  */
int input_desc;

/* Descriptor on which output file is open, or -1 if none is open yet.  */
int output_desc;
  68 \f
  69 void
  70 usage (reason)
  71     char *reason;
  72 {
  73   if (reason != NULL)
  74     fprintf (stderr, "%s: %s\n", program_name, reason);
  75   fprintf (stderr, "\
  76 Usage: %s [-lines] [-l lines] [-b bytes[bkm]] [-C bytes[bkm]]\n\
  77        [--lines=lines] [--bytes=bytes[bkm]] [--line-bytes=bytes[bkm]]\n\
  78        [infile [outfile-prefix]]\n",
  79            program_name);
  80   exit (2);
  81 }
  82 \f
  83 struct option longopts[] =
  84 {
  85   {"bytes", 1, NULL, 'b'},
  86   {"lines", 1, NULL, 'l'},
  87   {"line-bytes", 1, NULL, 'C'},
  88   {NULL, 0, NULL, 0}
  89 };
  90
  91 void
  92 main (argc, argv)
  93     int argc;
  94     char *argv[];
  95 {
  96   struct stat stat_buf;
  97   int num;                      /* numeric argument from command line */
  98   enum
  99     {
 100       type_undef, type_bytes, type_byteslines, type_lines, type_digits
 101     } split_type = type_undef;
 102   int in_blk_size;              /* optimal block size of input file device */
 103   char *buf;                    /* file i/o buffer */
 104   int accum = 0;
 105   char *outbase;
 106   int c;
 107   int digits_optind = 0;
 108
 109   program_name = argv[0];
 110
 111   /* Parse command line options.  */
 112
 113   infile = "-";
 114   outbase = "x";
 115
 116   while (1)
 117     {
 118       /* This is the argv-index of the option we will read next.  */
 119       int this_optind = optind ? optind : 1;
 120
 121       c = getopt_long (argc, argv, "0123456789b:l:C:", longopts, (int *) 0);
 122       if (c == EOF)
 123         break;
 124
 125       switch (c)
 126         {
 127         case 'b':
 128           if (split_type != type_undef)
 129             usage ("cannot split in more than one way");
 130           split_type = type_bytes;
 131           if (convint (optarg, &accum) == -1)
 132             usage ("invalid number of bytes");
 133           break;
 134
 135         case 'l':
 136           if (split_type != type_undef)
 137             usage ("cannot split in more than one way");
 138           split_type = type_lines;
 139           if (!isdigits (optarg))
 140             usage ("invalid number of lines");
 141           accum = atoi (optarg);
 142           break;
 143
 144         case 'C':
 145           if (split_type != type_undef)
 146             usage ("cannot split in more than one way");
 147           split_type = type_byteslines;
 148           if (convint (optarg, &accum) == -1)
 149             usage ("invalid number of bytes");
 150           break;
 151
 152         case '0':
 153         case '1':
 154         case '2':
 155         case '3':
 156         case '4':
 157         case '5':
 158         case '6':
 159         case '7':
 160         case '8':
 161         case '9':
 162           if (split_type != type_undef && split_type != type_digits)
 163             usage ("cannot split in more than one way");
 164           if (digits_optind != 0 && digits_optind != this_optind)
 165             accum = 0;          /* More than one number given; ignore other. */
 166           digits_optind = this_optind;
 167           split_type = type_digits;
 168           accum = accum * 10 + c - '0';
 169           break;
 170
 171         default:
 172           usage ((char *)0);
 173         }
 174     }
 175
 176   /* Handle default case.  */
 177   if (split_type == type_undef)
 178     {
 179       split_type = type_lines;
 180       accum = 1000;
 181     }
 182
 183   if (accum < 1)
 184     usage ("invalid number");
 185   num = accum;
 186
 187   /* Get out the filename arguments.  */
 188
 189   if (optind < argc)
 190     infile = argv[optind++];
 191
 192   if (optind < argc)
 193     outbase = argv[optind++];
 194
 195   if (optind < argc)
 196     usage ("too many arguments");
 197
 198   /* Open the input file.  */
 199   if (!strcmp (infile, "-"))
 200     input_desc = 0;
 201   else
 202     {
 203       input_desc = open (infile, O_RDONLY);
 204       if (input_desc < 0)
 205         error (1, errno, "%s", infile);
 206     }
 207
 208   /* No output file is open now.  */
 209   output_desc = -1;
 210
 211   /* Copy the output file prefix so we can add suffixes to it.
 212      26**29 is certainly enough output files!  */
 213
 214   outfile = xmalloc (strlen (outbase) + 30);
 215   strcpy (outfile, outbase);
 216   outfile_mid = outfile + strlen (outfile);
 217   outfile_end = outfile_mid + 2;
 218   bzero (outfile_mid, 30);
 219   outfile_mid[0] = 'a';
 220   outfile_mid[1] = 'a' - 1;  /* first call to next_file_name makes it an 'a' */
 221
 222   /* Get the optimal block size of input device and make a buffer.  */
 223
 224   if (fstat (input_desc, &stat_buf) < 0)
 225     error (1, errno, "%s", infile);
 226   in_blk_size = ST_BLKSIZE (stat_buf);
 227
 228   buf = xmalloc (in_blk_size + 1);
 229
 230   switch (split_type)
 231     {
 232     case type_digits:
 233     case type_lines:
 234       lines_split (num, buf, in_blk_size);
 235       break;
 236
 237     case type_bytes:
 238       bytes_split (num, buf, in_blk_size);
 239       break;
 240
 241     case type_byteslines:
 242       line_bytes_split (num);
 243       break;
 244     }
 245
 246   if (close (input_desc) < 0)
 247     error (1, errno, "%s", infile);
 248   if (output_desc >= 0 && close (output_desc) < 0)
 249     error (1, errno, "%s", outfile);
 250
 251   exit (0);
 252 }
 253
 254 /* Return nonzero if the string STR is composed entirely of decimal digits.  */
 255
 256 int
 257 isdigits (str)
 258     char *str;
 259 {
 260   do
 261     {
 262       if (!isdigit (*str))
 263         return 0;
 264       str++;
 265     }
 266   while (*str);
 267   return 1;
 268 }
 269
 270 /* Put the value of the number in STR into *VAL.
 271    STR can specify a positive integer, optionally ending in `k'
 272    to mean kilo or `m' to mean mega.
 273    Return 0 if STR is valid, -1 if not. */
 274
 275 int
 276 convint (str, val)
 277      char *str;
 278      int *val;
 279 {
 280   int multiplier = 1;
 281   int arglen = strlen (str);
 282
 283   if (arglen > 1)
 284     {
 285       switch (str[arglen - 1])
 286         {
 287         case 'b':
 288           multiplier = 512;
 289           str[arglen - 1] = '\0';
 290           break;
 291         case 'k':
 292           multiplier = 1024;
 293           str[arglen - 1] = '\0';
 294           break;
 295         case 'm':
 296           multiplier = 1048576;
 297           str[arglen - 1] = '\0';
 298           break;
 299         }
 300     }
 301   if (!isdigits (str))
 302     return -1;
 303   *val = atoi (str) * multiplier;
 304   return 0;
 305 }
 306 \f
 307 /* Split into pieces of exactly NCHARS bytes.
 308    Use buffer BUF, whose size is BUFSIZE.  */
 309
 310 void
 311 bytes_split (nchars, buf, bufsize)
 312     int nchars;
 313     char *buf;
 314     int bufsize;
 315 {
 316   int n_read;
 317   int new_file_flag = 1;
 318   int to_read;
 319   int to_write = nchars;
 320   char *bp_out;
 321
 322   do
 323     {
 324       n_read = stdread (buf, bufsize);
 325       if (n_read < 0)
 326         error (1, errno, "%s", infile);
 327       bp_out = buf;
 328       to_read = n_read;
 329       for (;;)
 330         {
 331           if (to_read < to_write)
 332             {
 333               if (to_read)      /* do not write 0 bytes! */
 334                 {
 335                   cwrite (new_file_flag, bp_out, to_read);
 336                   to_write -= to_read;
 337                   new_file_flag = 0;
 338                 }
 339               break;
 340             }
 341           else
 342             {
 343               cwrite (new_file_flag, bp_out, to_write);
 344               bp_out += to_write;
 345               to_read -= to_write;
 346               new_file_flag = 1;
 347               to_write = nchars;
 348             }
 349         }
 350     }
 351   while (n_read == bufsize);
 352 }
 353 \f
 354 /* Split into pieces of exactly NLINES lines.
 355    Use buffer BUF, whose size is BUFSIZE.  */
 356
 357 void
 358 lines_split (nlines, buf, bufsize)
 359     int nlines;
 360     char *buf;
 361     int bufsize;
 362 {
 363   int n_read;
 364   char *bp, *bp_out, *eob;
 365   int new_file_flag = 1;
 366   int n = 0;
 367
 368   do
 369     {
 370       n_read = stdread (buf, bufsize);
 371       if (n_read < 0)
 372         error (1, errno, "%s", infile);
 373       bp = bp_out = buf;
 374       eob = bp + n_read;
 375       *eob = '\n';
 376       for (;;)
 377         {
 378           while (*bp++ != '\n')
 379             ;                   /* this semicolon takes most of the time */
 380           if (bp > eob)
 381             {
 382               if (eob != bp_out) /* do not write 0 bytes! */
 383                 {
 384                   cwrite (new_file_flag, bp_out, eob - bp_out);
 385                   new_file_flag = 0;
 386                 }
 387               break;
 388             }
 389           else
 390             if (++n >= nlines)
 391               {
 392                 cwrite (new_file_flag, bp_out, bp - bp_out);
 393                 bp_out = bp;
 394                 new_file_flag = 1;
 395                 n = 0;
 396               }
 397         }
 398     }
 399   while (n_read == bufsize);
 400 }
 401 \f
 402 /* Split into pieces that are as large as possible while still not more
 403    than NCHARS bytes, and are split on line boundaries except
 404    where lines longer than NCHARS bytes occur. */
 405
 406 void
 407 line_bytes_split (nchars)
 408     int nchars;
 409 {
 410   int n_read;
 411   char *bp;
 412   int eof = 0;
 413   int n_buffered = 0;
 414   char *buf = (char *) xmalloc (nchars);
 415
 416   do
 417     {
 418       /* Fill up the full buffer size from the input file.  */
 419
 420       n_read = stdread (buf + n_buffered, nchars - n_buffered);
 421       if (n_read < 0)
 422         error (1, errno, "%s", infile);
 423
 424       n_buffered += n_read;
 425       if (n_buffered != nchars)
 426         eof = 1;
 427
 428       /* Find where to end this chunk.  */
 429       bp = buf + n_buffered;
 430       if (n_buffered == nchars)
 431         {
 432           while (bp > buf && bp[-1] != '\n')
 433             bp--;
 434         }
 435
 436       /* If chunk has no newlines, use all the chunk.  */
 437       if (bp == buf)
 438         bp = buf + n_buffered;
 439
 440       /* Output the chars as one output file.  */
 441       cwrite (1, buf, bp - buf);
 442
 443       /* Discard the chars we just output; move rest of chunk
 444          down to be the start of the next chunk.  */
 445       n_buffered -= bp - buf;
 446       if (n_buffered > 0)
 447         bcopy (bp, buf, n_buffered);
 448     }
 449   while (!eof);
 450   free (buf);
 451 }
 452 \f
 453 /* Write BYTES bytes at BP to an output file.
 454    If NEW_FILE_FLAG is nonzero, open the next output file.
 455    Otherwise add to the same output file already in use.  */
 456
 457 void
 458 cwrite (new_file_flag, bp, bytes)
 459     int new_file_flag;
 460     char *bp;
 461     int bytes;
 462 {
 463   if (new_file_flag)
 464     {
 465       if (output_desc >= 0 && close (output_desc) < 0)
 466         error (1, errno, "%s", outfile);
 467
 468       next_file_name ();
 469       output_desc = open (outfile, O_WRONLY | O_CREAT | O_TRUNC, 0666);
 470       if (output_desc < 0)
 471         error (1, errno, "%s", outfile);
 472     }
 473   if (write (output_desc, bp, bytes) < 0)
 474     error (1, errno, "%s", outfile);
 475 }
 476
 477 /* Read NCHARS bytes from the input file into BUF.
 478    Return the number of bytes successfully read.
 479    If this is less than NCHARS, do not call `stdread' again.  */
 480
 481 int
 482 stdread (buf, nchars)
 483     char *buf;
 484     int nchars;
 485 {
 486   int n_read;
 487   int to_be_read = nchars;
 488
 489   while (to_be_read)
 490     {
 491       n_read = read (input_desc, buf, to_be_read);
 492       if (n_read < 0)
 493         return -1;
 494       if (n_read == 0)
 495         break;
 496       to_be_read -= n_read;
 497       buf += n_read;
 498     }
 499   return nchars - to_be_read;
 500 }
 501
 502 /* Compute the next sequential output file name suffix and store it
 503    into the string `outfile' at the position pointed to by `outfile_mid'.  */
 504
 505 void
 506 next_file_name ()
 507 {
 508   int x;
 509   char *ne;
 510
 511   outfile_count++;
 512   if (outfile_count < outfile_name_limit)
 513     {
 514       for (ne = outfile_end - 1; ; ne--)
 515         {
 516           x = *ne;
 517           if (x != 'z')
 518             break;
 519           *ne = 'a';
 520         }
 521       *ne = x + 1;
 522       return;
 523     }
 524
 525   outfile_count = 0;
 526   outfile_name_limit *= 26;
 527   outfile_name_generation++;
 528   *outfile_mid++ = 'z';
 529   for (x = 0; x <= outfile_name_generation; x++)
 530     outfile_mid[x] = 'a';
 531   outfile_end += 2;
 532 }