From b25038ce9a234ea0906ddcbd8a0012e917e6c661 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Sun, 8 Nov 1992 02:50:43 +0000 Subject: [PATCH] Initial revision --- COPYING | 339 ++++ lib/bcopy.c | 19 + lib/linebuffer.c | 91 + lib/linebuffer.h | 42 + lib/memchr.c | 145 ++ lib/memset.c | 29 + lib/regex.c | 4870 +++++++++++++++++++++++++++++++++++++++++++++++ lib/regex.h | 481 +++++ old/textutils/ChangeLog | 855 +++++++++ src/cat.c | 660 +++++++ src/cksum.c | 274 +++ src/comm.c | 221 +++ src/csplit.c | 1308 +++++++++++++ src/cut.c | 586 ++++++ src/expand.c | 377 ++++ src/fold.c | 250 +++ src/head.c | 380 ++++ src/join.c | 690 +++++++ src/nl.c | 546 ++++++ src/od.c | 1697 +++++++++++++++++ src/paste.c | 458 +++++ src/pr.c | 1844 ++++++++++++++++++ src/sort.c | 1746 +++++++++++++++++ src/split.c | 532 ++++++ src/sum.c | 217 +++ src/tac.c | 628 ++++++ src/tail.c | 858 +++++++++ src/tr.c | 1813 ++++++++++++++++++ src/unexpand.c | 432 +++++ src/uniq.c | 321 ++++ src/wc.c | 231 +++ 31 files changed, 22940 insertions(+) create mode 100644 COPYING create mode 100644 lib/bcopy.c create mode 100644 lib/linebuffer.c create mode 100644 lib/linebuffer.h create mode 100644 lib/memchr.c create mode 100644 lib/memset.c create mode 100644 lib/regex.c create mode 100644 lib/regex.h create mode 100644 old/textutils/ChangeLog create mode 100644 src/cat.c create mode 100644 src/cksum.c create mode 100644 src/comm.c create mode 100644 src/csplit.c create mode 100644 src/cut.c create mode 100644 src/expand.c create mode 100644 src/fold.c create mode 100644 src/head.c create mode 100644 src/join.c create mode 100644 src/nl.c create mode 100644 src/od.c create mode 100644 src/paste.c create mode 100644 src/pr.c create mode 100644 src/sort.c create mode 100644 src/split.c create mode 100644 src/sum.c create mode 100644 src/tac.c create mode 100644 src/tail.c create mode 100644 src/tr.c create mode 100644 src/unexpand.c create mode 100644 src/uniq.c create mode 100644 src/wc.c diff --git 
a/COPYING b/COPYING new file mode 100644 index 0000000..a43ea21 --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + Appendix: How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) 19yy <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) 19yy name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License.
diff --git a/lib/bcopy.c b/lib/bcopy.c new file mode 100644 index 0000000..a8991c5 --- /dev/null +++ b/lib/bcopy.c @@ -0,0 +1,19 @@ +/* bcopy.c -- copy memory. + Copy LENGTH bytes from SOURCE to DEST. Does not null-terminate. + In the public domain. + By David MacKenzie . */ + +void +bcopy (source, dest, length) + char *source, *dest; + unsigned length; +{ + if (source < dest) + /* Moving from low mem to hi mem; start at end. */ + for (source += length, dest += length; length; --length) + *--dest = *--source; + else if (source != dest) + /* Moving from hi mem to low mem; start at beginning. */ + for (; length; --length) + *dest++ = *source++; +} diff --git a/lib/linebuffer.c b/lib/linebuffer.c new file mode 100644 index 0000000..7f53aed --- /dev/null +++ b/lib/linebuffer.c @@ -0,0 +1,91 @@ +/* linebuffer.c -- read arbitrarily long lines + Copyright (C) 1986, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Richard Stallman. */ + +#include <stdio.h> +#include "linebuffer.h" + +char *xmalloc (); +char *xrealloc (); +void free (); + +/* Initialize linebuffer LINEBUFFER for use.
*/ + +void +initbuffer (linebuffer) + struct linebuffer *linebuffer; +{ + linebuffer->length = 0; + linebuffer->size = 200; + linebuffer->buffer = (char *) xmalloc (linebuffer->size); +} + +/* Read an arbitrarily long line of text from STREAM into LINEBUFFER. + Remove any newline. Does not null terminate. + Return LINEBUFFER, except at end of file return 0. */ + +struct linebuffer * +readline (linebuffer, stream) + struct linebuffer *linebuffer; + FILE *stream; +{ + int c; + char *buffer = linebuffer->buffer; + char *p = linebuffer->buffer; + char *end = buffer + linebuffer->size; /* Sentinel. */ + + if (feof (stream)) + { + linebuffer->length = 0; + return 0; + } + + while (1) + { + c = getc (stream); + if (p == end) + { + linebuffer->size *= 2; + buffer = (char *) xrealloc (buffer, linebuffer->size); + p += buffer - linebuffer->buffer; + linebuffer->buffer = buffer; + end = buffer + linebuffer->size; + } + if (c == EOF || c == '\n') + break; + *p++ = c; + } + + if (feof (stream) && p == buffer) + { + linebuffer->length = 0; + return 0; + } + linebuffer->length = p - linebuffer->buffer; + return linebuffer; +} + +/* Free linebuffer LINEBUFFER and its data, all allocated with malloc. */ + +void +freebuffer (linebuffer) + struct linebuffer *linebuffer; +{ + free (linebuffer->buffer); + free (linebuffer); +} diff --git a/lib/linebuffer.h b/lib/linebuffer.h new file mode 100644 index 0000000..13abe18 --- /dev/null +++ b/lib/linebuffer.h @@ -0,0 +1,42 @@ +/* linebuffer.h -- declarations for reading arbitrarily long lines + Copyright (C) 1986, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* A `struct linebuffer' holds a line of text. */ + +struct linebuffer +{ + long size; /* Allocated. */ + long length; /* Used. */ + char *buffer; +}; + +#ifdef __STDC__ +/* Initialize linebuffer LINEBUFFER for use. */ +void initbuffer (struct linebuffer *linebuffer); + +/* Read an arbitrarily long line of text from STREAM into LINEBUFFER. + Remove any newline. Does not null terminate. + Return LINEBUFFER, except at end of file return 0. */ +struct linebuffer *readline (struct linebuffer *linebuffer, FILE *stream); + +/* Free linebuffer LINEBUFFER and its data, all allocated with malloc. */ +void freebuffer (struct linebuffer *); +#else +void initbuffer (); +struct linebuffer *readline (); +void freebuffer (); +#endif diff --git a/lib/memchr.c b/lib/memchr.c new file mode 100644 index 0000000..cb8d4a2 --- /dev/null +++ b/lib/memchr.c @@ -0,0 +1,145 @@ +/* Copyright (C) 1991 Free Software Foundation, Inc. + Based on strlen implemention by Torbjorn Granlund (tege@sics.se), + with help from Dan Sahlin (dan@sics.se) and + commentary by Jim Blandy (jimb@ai.mit.edu); + adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu), + and implemented by Roland McGrath (roland@ai.mit.edu). + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. 
+ +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 675 Mass Ave, +Cambridge, MA 02139, USA. */ + + + + +/* Search no more than N bytes of S for C. */ + +char * +memchr(s, c, n) + unsigned char * s ; + int c ; + unsigned n; +{ + unsigned char *char_ptr; + unsigned long int *longword_ptr; + unsigned long int longword, magic_bits, charmask; + + c = (unsigned char) c; + + /* Handle the first few characters by reading one character at a time. + Do this until CHAR_PTR is aligned on a 4-byte border. */ + for (char_ptr = s; n > 0 && ((unsigned long int) char_ptr & 3) != 0; + --n, ++char_ptr) + if (*char_ptr == c) + return (char *) char_ptr; + + longword_ptr = (unsigned long int *) char_ptr; + + /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits + the "holes." Note that there is a hole just to the left of + each byte, with an extra at the end: + + bits: 01111110 11111110 11111110 11111111 + bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD + + The 1-bits make sure that carries propagate to the next 0-bit. + The 0-bits provide holes for carries to fall into. */ + magic_bits = 0x7efefeff; + + /* Set up a longword, each of whose bytes is C. */ + charmask = c | (c << 8); + charmask |= charmask << 16; + + /* Instead of the traditional loop which tests each character, + we will test a longword at a time. The tricky part is testing + if *any of the four* bytes in the longword in question are zero. */ + while (n >= 4) + { + /* We tentatively exit the loop if adding MAGIC_BITS to + LONGWORD fails to change any of the hole bits of LONGWORD. + + 1) Is this safe? 
Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-30 is set, there will be a carry + into bit 31, so all of the hole bits will be changed. + + The one misfire occurs when bits 24-30 are clear and bit + 31 is set; in this case, the hole at bit 31 is not + changed. If we had access to the processor carry flag, + we could close this loophole by putting the fourth hole + at bit 32! + + So it ignores everything except 128's, when they're aligned + properly. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + longword = *longword_ptr++ ^ charmask; + + /* Add MAGIC_BITS to LONGWORD. */ + if ((((longword + magic_bits) + + /* Set those bits that were unchanged by the addition. */ + ^ ~longword) + + /* Look at only the hole bits. If any of the hole bits + are unchanged, most likely one of the bytes was a + zero. */ + & ~magic_bits) != 0) + { + /* Which of the bytes was C? If none of them were, it was + a misfire; continue the search. 
*/ + + unsigned char *cp = ( unsigned char *) (longword_ptr - 1); + + if (cp[0] == c) + return (char *) cp; + if (cp[1] == c) + return (char *) &cp[1]; + if (cp[2] == c) + return (char *) &cp[2]; + if (cp[3] == c) + return (char *) &cp[3]; + } + + n -= 4; + } + + char_ptr = ( unsigned char *) longword_ptr; + + while (n-- > 0) + { + if (*char_ptr == c) + return (char *) char_ptr; + else + ++char_ptr; + } + + return 0; +} diff --git a/lib/memset.c b/lib/memset.c new file mode 100644 index 0000000..0e819f2 --- /dev/null +++ b/lib/memset.c @@ -0,0 +1,29 @@ +/* memset.c -- set an area of memory to a given value + Copyright (C) 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +char * +memset (str, c, len) + char *str; + int c; + unsigned len; +{ + register char *st = str; + + while (len-- > 0) + *st++ = c; + return str; +} diff --git a/lib/regex.c b/lib/regex.c new file mode 100644 index 0000000..3129ed4 --- /dev/null +++ b/lib/regex.c @@ -0,0 +1,4870 @@ +/* Extended regular expression matching and search library, + version 0.11. + (Implements POSIX draft P10003.2/D11.2, except for + internationalization features.) + + Copyright (C) 1985, 89, 90, 91, 92 Free Software Foundation, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* AIX requires this to be the first thing in the file. */ +#if defined (_AIX) && !defined (REGEX_MALLOC) + #pragma alloca +#endif + +#define _GNU_SOURCE + +/* We need this for `regex.h', and perhaps for the Emacs include files. */ +#include <sys/types.h> + +/* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. */ +#ifdef emacs + +#include "config.h" +#include "lisp.h" +#include "buffer.h" +#include "syntax.h" + +/* Emacs uses `NULL' as a predicate. */ +#undef NULL + +#else /* not emacs */ + +/* We used to test for `BSTRING' here, but only GCC and Emacs define + `BSTRING', as far as I know, and neither of them use this code. */ +#if USG || STDC_HEADERS +#include <string.h> +#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) +#define bcopy(s, d, n) memcpy ((d), (s), (n)) +#define bzero(s, n) memset ((s), 0, (n)) +#else +#include <strings.h> +#endif + +#ifdef STDC_HEADERS +#include <stdlib.h> +#else +char *malloc (); +char *realloc (); +#endif + + +/* Define the syntax stuff for \<, \>, etc. */ + +/* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ +#ifndef Sword +#define Sword 1 +#endif + +#ifdef SYNTAX_TABLE + +extern char *re_syntax_table; + +#else /* not SYNTAX_TABLE */ + +/* How many characters in the character set.
*/ +#define CHAR_SET_SIZE 256 + +static char re_syntax_table[CHAR_SET_SIZE]; + +static void +init_syntax_once () +{ + register int c; + static int done = 0; + + if (done) + return; + + bzero (re_syntax_table, sizeof re_syntax_table); + + for (c = 'a'; c <= 'z'; c++) + re_syntax_table[c] = Sword; + + for (c = 'A'; c <= 'Z'; c++) + re_syntax_table[c] = Sword; + + for (c = '0'; c <= '9'; c++) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; +} + +#endif /* not SYNTAX_TABLE */ + +#define SYNTAX(c) re_syntax_table[c] + +#endif /* not emacs */ + +/* Get the interface, including the syntax bits. */ +#include "regex.h" + + +/* isalpha etc. are used for the character classes. */ +#include <ctype.h> +#ifndef isgraph +#define isgraph(c) (isprint (c) && !isspace (c)) +#endif +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif + +#ifndef NULL +#define NULL 0 +#endif + +/* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ +#undef SIGN_EXTEND_CHAR +#if __STDC__ +#define SIGN_EXTEND_CHAR(c) ((signed char) (c)) +#else +/* As in Harbison and Steele. */ +#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) +#endif + +/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in.
*/ + +#ifdef REGEX_MALLOC + +#define REGEX_ALLOCATE malloc +#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) + +#else /* not REGEX_MALLOC */ + +/* Emacs already defines alloca, sometimes. */ +#ifndef alloca + +/* Make alloca work the best possible way. */ +#ifdef __GNUC__ +#define alloca __builtin_alloca +#else /* not __GNUC__ */ +#if HAVE_ALLOCA_H +#include <alloca.h> +#else /* not __GNUC__ or HAVE_ALLOCA_H */ +#ifndef _AIX /* Already did AIX, up at the top. */ +char *alloca (); +#endif /* not _AIX */ +#endif /* not HAVE_ALLOCA_H */ +#endif /* not __GNUC__ */ + +#endif /* not alloca */ + +#define REGEX_ALLOCATE alloca + +/* Assumes a `char *destination' variable. */ +#define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + bcopy (source, destination, osize), \ + destination) + +#endif /* not REGEX_MALLOC */ + + +/* True if `size1' is nonzero and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ +#define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + +/* (Re)Allocate N items of type T using malloc, or fail. */ +#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) +#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) +#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + +#define BYTEWIDTH 8 /* In bits. */ + +#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +typedef char boolean; +#define false 0 +#define true 1 + +/* These are the command codes that appear in compiled regular + expressions. Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. + + The value of `exactn' is needed in search.c (search_buffer) in Emacs.
+ So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of + `exactn' we use here must also be 1. */ + +typedef enum +{ + no_op = 0, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn = 1, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. */ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. 
*/ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. */ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. 
*/ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ + +#ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec +#endif /* emacs */ +} re_opcode_t; + +/* Common operations on the compiled pattern. */ + +/* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + +#define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + +/* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + +#define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + +/* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. */ + +#define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + +#ifdef DEBUG +static void +extract_number (dest, source) + int *dest; + unsigned char *source; +{ + int temp = SIGN_EXTEND_CHAR (*(source + 1)); + *dest = *source & 0377; + *dest += temp << 8; +} + +#ifndef EXTRACT_MACROS /* To debug the macros. */ +#undef EXTRACT_NUMBER +#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) +#endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. 
*/ + +#define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + +#ifdef DEBUG +static void +extract_number_and_incr (destination, source) + int *destination; + unsigned char **source; +{ + extract_number (destination, *source); + *source += 2; +} + +#ifndef EXTRACT_MACROS +#undef EXTRACT_NUMBER_AND_INCR +#define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) +#endif /* not EXTRACT_MACROS */ + +#endif /* DEBUG */ + +/* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + +#ifdef DEBUG + +/* We use standard I/O for debugging. */ +#include <stdio.h> + +/* It is useful to test things that ``must'' be true when debugging. */ +#include <assert.h> + +static int debug = 0; + +#define DEBUG_STATEMENT(e) e +#define DEBUG_PRINT1(x) if (debug) printf (x) +#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + +extern void printchar (); + +/* Print the fastmap in human-readable form.
*/ + +void +print_fastmap (fastmap) + char *fastmap; +{ + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) + { + if (fastmap[i++]) + { + was_a_range = 0; + printchar (i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) + { + was_a_range = 1; + i++; + } + if (was_a_range) + { + printf ("-"); + printchar (i - 1); + } + } + } + putchar ('\n'); +} + + +/* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + +void +print_partial_compiled_pattern (start, end) + unsigned char *start; + unsigned char *end; +{ + int mcnt, mcnt2; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) + { + printf ("(null)\n"); + return; + } + + /* Loop over pattern commands. */ + while (p < pend) + { + switch ((re_opcode_t) *p++) + { + case no_op: + printf ("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf ("/exactn/%d", mcnt); + do + { + putchar ('/'); + printchar (*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf ("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf ("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf ("/duplicate/%d", *p++); + break; + + case anychar: + printf ("/anychar"); + break; + + case charset: + case charset_not: + { + register int c; + + printf ("/charset%s", + (re_opcode_t) *(p - 1) == charset_not ?
"_not" : ""); + + assert (p + *p < pend); + + for (c = 0; c < *p; c++) + { + unsigned bit; + unsigned char map_byte = p[1 + c]; + + putchar ('/'); + + for (bit = 0; bit < BYTEWIDTH; bit++) + if (map_byte & (1 << bit)) + printchar (c * BYTEWIDTH + bit); + } + p += 1 + *p; + break; + } + + case begline: + printf ("/begline"); + break; + + case endline: + printf ("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump/0/%d", mcnt); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_keep_string_jump/0/%d", mcnt); + break; + + case dummy_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/dummy_failure_jump/0/%d", mcnt); + break; + + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/maybe_pop_jump/0/%d", mcnt); + break; + + case pop_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/pop_failure_jump/0/%d", mcnt); + break; + + case jump_past_alt: + extract_number_and_incr (&mcnt, &p); + printf ("/jump_past_alt/0/%d", mcnt); + break; + + case jump: + extract_number_and_incr (&mcnt, &p); + printf ("/jump/0/%d", mcnt); + break; + + case succeed_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case jump_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2); + break; + + case set_number_at: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2); + break; + + case wordbound: + printf ("/wordbound"); + break; + + case notwordbound: + printf ("/notwordbound"); + break; + + case wordbeg: + printf ("/wordbeg"); + break; + + case wordend: + printf ("/wordend"); + break; + +#ifdef emacs + case before_dot: + printf
("/before_dot"); + break; + + case at_dot: + printf ("/at_dot"); + break; + + case after_dot: + printf ("/after_dot"); + break; + + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; +#endif /* emacs */ + + case wordchar: + printf ("/wordchar"); + break; + + case notwordchar: + printf ("/notwordchar"); + break; + + case begbuf: + printf ("/begbuf"); + break; + + case endbuf: + printf ("/endbuf"); + break; + + default: + printf ("?%d", *(p-1)); + } + } + printf ("/\n"); +} + + +void +print_compiled_pattern (bufp) + struct re_pattern_buffer *bufp; +{ + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern (buffer, buffer + bufp->used); + printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) + { + printf ("fastmap: "); + print_fastmap (bufp->fastmap); + } + + printf ("re_nsub: %d\t", bufp->re_nsub); + printf ("regs_alloc: %d\t", bufp->regs_allocated); + printf ("can_be_null: %d\t", bufp->can_be_null); + printf ("newline_anchor: %d\n", bufp->newline_anchor); + printf ("no_sub: %d\t", bufp->no_sub); + printf ("not_bol: %d\t", bufp->not_bol); + printf ("not_eol: %d\t", bufp->not_eol); + printf ("syntax: %d\n", bufp->syntax); + /* Perhaps we should print the translate table? 
*/ +} + + +void +print_double_string (where, string1, size1, string2, size2) + const char *where; + const char *string1; + const char *string2; + int size1; + int size2; +{ + unsigned this_char; + + if (where == NULL) + printf ("(null)"); + else + { + if (FIRST_STRING_P (where)) + { + for (this_char = where - string1; this_char < size1; this_char++) + printchar (string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + printchar (string2[this_char]); + } +} + +#else /* not DEBUG */ + +#undef assert +#define assert(e) + +#define DEBUG_STATEMENT(e) +#define DEBUG_PRINT1(x) +#define DEBUG_PRINT2(x1, x2) +#define DEBUG_PRINT3(x1, x2, x3) +#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) +#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + +#endif /* not DEBUG */ + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t +re_set_syntax (syntax) + reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; +} + +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. 
*/ + +static const char *re_error_msg[] = + { NULL, /* REG_NOERROR */ + "No match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "Invalid collation character", /* REG_ECOLLATE */ + "Invalid character class name", /* REG_ECTYPE */ + "Trailing backslash", /* REG_EESCAPE */ + "Invalid back reference", /* REG_ESUBREG */ + "Unmatched [ or [^", /* REG_EBRACK */ + "Unmatched ( or \\(", /* REG_EPAREN */ + "Unmatched \\{", /* REG_EBRACE */ + "Invalid content of \\{\\}", /* REG_BADBR */ + "Invalid range end", /* REG_ERANGE */ + "Memory exhausted", /* REG_ESPACE */ + "Invalid preceding regular expression", /* REG_BADRPT */ + "Premature end of regular expression", /* REG_EEND */ + "Regular expression too big", /* REG_ESIZE */ + "Unmatched ) or \\)", /* REG_ERPAREN */ + }; + +/* Subroutine declarations and macros for regex_compile. */ + +static void store_op1 (), store_op2 (); +static void insert_op1 (), insert_op2 (); +static boolean at_begline_loc_p (), at_endline_loc_p (); +static boolean group_in_compile_stack (); +static reg_errcode_t compile_range (); + +/* Fetch the next character in the uncompiled pattern---translating it + if necessary. Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ +#define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = translate[c]; \ + } while (0) + +/* Fetch the next character in the uncompiled pattern, with no + translation. */ +#define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + +/* Go backwards one character in the pattern. */ +#define PATUNFETCH p-- + + +/* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. 
But + when we use a character as a subscript we must make it unsigned. */ +#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) + + +/* Macros for outputting the compiled pattern into `buffer'. */ + +/* If the buffer isn't allocated when it comes in, use this. */ +#define INIT_BUF_SIZE 32 + +/* Make sure we have at least N more bytes of space in buffer. */ +#define GET_BUFFER_SPACE(n) \ + while (b - bufp->buffer + (n) > bufp->allocated) \ + EXTEND_BUFFER () + +/* Make sure we have one more byte of buffer space and then add C to it. */ +#define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + +/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ +#define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + +/* As with BUF_PUSH_2, except for three bytes. */ +#define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + +/* Store a jump with opcode OP at LOC to location TO. We store a + relative address offset by the three bytes the jump itself occupies. */ +#define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (to) - (loc) - 3) + +/* Likewise, for a two-argument jump. */ +#define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (to) - (loc) - 3, arg) + +/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (to) - (loc) - 3, b) + +/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ +#define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (to) - (loc) - 3, arg, b) + + +/* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. 
So if 2^16 bytes turns out to + be too small, many things would have to change. */ +#define MAX_BUF_SIZE (1L << 16) + + +/* Double the size of the buffer via realloc (clamping at MAX_BUF_SIZE) and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If the buffer is already MAX_BUF_SIZE + bytes, return REG_ESIZE; if realloc fails, release the old block and + return REG_ESPACE. */ +#define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + { \ + /* realloc leaves the old block allocated on failure. */ \ + free (old_buffer); \ + return REG_ESPACE; \ + } \ + /* If the buffer moved, move all the pointers into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + +/* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ +#define MAX_REGNUM 255 + +/* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ +typedef unsigned regnum_t; + + +/* Macros for the compile stack. */ + +/* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.
*/ +typedef int pattern_offset_t; + +typedef struct +{ + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; +} compile_stack_elt_t; + + +typedef struct +{ + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position. */ +} compile_stack_type; + + +#define INIT_COMPILE_STACK_SIZE 32 + +#define COMPILE_STACK_EMPTY (compile_stack.avail == 0) +#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + +/* The next available element. */ +#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + +/* Set the bit for character C in a list. */ +#define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) c) % BYTEWIDTH)) + + +/* Get the next unsigned number in the uncompiled pattern. */ +#define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while (isdigit (c)) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + +#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + +#define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) + +/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry. 
+ + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is set to zero; + `re_nsub' is set to the number of groups in PATTERN; + `not_bol' and `not_eol' are set to zero. + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + +static reg_errcode_t +regex_compile (pattern, size, syntax, bufp) + const char *pattern; + int size; + reg_syntax_t syntax; + struct re_pattern_buffer *bufp; +{ + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random tempory spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + char *translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. 
*/ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + +#ifdef DEBUG + DEBUG_PRINT1 ("\nCompiling pattern: "); + if (debug) + { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + printchar (pattern[debug_count]); + putchar ('\n'); + } +#endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + +#if !defined (emacs) && !defined (SYNTAX_TABLE) + /* Initialize the syntax table. */ + init_syntax_once (); +#endif + + if (bufp->allocated == 0) + { + if (bufp->buffer) + { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); + } + else + { /* Caller did not allocate a buffer. Do it for them. 
*/ + bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) return REG_ESPACE; + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) + { + PATFETCH (c); + + switch (c) + { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p (pattern, p, syntax)) + BUF_PUSH (begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p (p, pend, syntax)) + BUF_PUSH (endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. 
*/ + + for (;;) + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH (c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + ; + + else if (syntax & RE_BK_PLUS_QM && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + if (!(c1 == '+' || c1 == '?')) + { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } + else + { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) + { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert (p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE (3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && !(syntax & RE_DOT_NEWLINE)) + { /* We have .*\n. */ + STORE_JUMP (jump, b, laststart); + keep_string_p = true; + } + else + /* Anything else. */ + STORE_JUMP (maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. 
*/ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, + laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) + { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH (anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) return REG_EBRACK; + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE (34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_LIST_BIT ('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) return REG_EBRACK; + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + SET_LIST_BIT (c1); + continue; + } + + /* Could be the end of the bracket expression. 
If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + return REG_ERANGE; + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret + = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + + ret = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) return REG_EBRACK; + + for (;;) + { + PATFETCH (c); + if (c == ':' || c == ']' || p == pend + || c1 == CHAR_CLASS_MAX_LENGTH) + break; + str[c1++] = c; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and:`]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). 
*/ + if (c == ':' && *p == ']') + { + int ch; + boolean is_alnum = STREQ (str, "alnum"); + boolean is_alpha = STREQ (str, "alpha"); + boolean is_blank = STREQ (str, "blank"); + boolean is_cntrl = STREQ (str, "cntrl"); + boolean is_digit = STREQ (str, "digit"); + boolean is_graph = STREQ (str, "graph"); + boolean is_lower = STREQ (str, "lower"); + boolean is_print = STREQ (str, "print"); + boolean is_punct = STREQ (str, "punct"); + boolean is_space = STREQ (str, "space"); + boolean is_upper = STREQ (str, "upper"); + boolean is_xdigit = STREQ (str, "xdigit"); + + if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) return REG_EBRACK; + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + { + if ( (is_alnum && isalnum (ch)) + || (is_alpha && isalpha (ch)) + || (is_blank && isblank (ch)) + || (is_cntrl && iscntrl (ch)) + || (is_digit && isdigit (ch)) + || (is_graph && isgraph (ch)) + || (is_lower && islower (ch)) + || (is_print && isprint (ch)) + || (is_punct && ispunct (ch)) + || (is_space && isspace (ch)) + || (is_upper && isupper (ch)) + || (is_xdigit && isxdigit (ch))) + SET_LIST_BIT (ch); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT (':'); + had_char_class = false; + } + } + else + { + had_char_class = false; + SET_LIST_BIT (c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. 
*/ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) return REG_EESCAPE; + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. */ + PATFETCH_RAW (c); + + switch (c) + { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) + { + RETALLOC (compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. 
*/ + if (regnum <= MAX_REGNUM) + { + COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + BUF_PUSH_3 (start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) goto normal_backslash; + + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + return REG_ERPAREN; + + handle_close: + if (fixup_alt_jump) + { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH (push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + return REG_ERPAREN; + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert (compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump + = COMPILE_STACK_TOP.fixup_alt_jump + ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + : 0; + laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + + /* We're at the end of the group, so now we know how many + groups were inside this one. 
*/ + if (this_group_regnum <= MAX_REGNUM) + { + unsigned char *inner_group_loc + = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3 (stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b,' then fixup_alt_jump right now points to a + three-byte space after `a.' We'll put in the jump, set + fixup_alt_jump to right after `b,' and leave behind three + bytes which we'll fill in when we get to after `c.' */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE (3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. */ + || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + || (p - 2 == pattern && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. 
*/ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_EBRACE; + } + + GET_UNSIGNED_NUMBER (lower_bound); + + if (c == ',') + { + GET_UNSIGNED_NUMBER (upper_bound); + if (upper_bound < 0) upper_bound = RE_DUP_MAX; + } + else + /* Interval such as `{1}' => match exactly once. */ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (c != '\\') return REG_EBRACE; + + PATFETCH (c); + } + + if (c != '}') + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) + { + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at + set_number_at + succeed_n + + jump_n + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else + { /* If the upper bound is > 1, we need to insert + more at the end of the loop. 
*/ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE (nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + + if (upper_bound > 1) + { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2 (jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2 (set_number_at, laststart, b - laststart, + upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert (beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. 
*/ + PATFETCH (c); + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + +#ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. --karl */ + case '=': + BUF_PUSH (at_dot); + break; + + case 's': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); + break; +#endif /* emacs */ + + + case 'w': + laststart = b; + BUF_PUSH (wordchar); + break; + + + case 'W': + laststart = b; + BUF_PUSH (notwordchar); + break; + + + case '<': + BUF_PUSH (wordbeg); + break; + + case '>': + BUF_PUSH (wordend); + break; + + case 'b': + BUF_PUSH (wordbound); + break; + + case 'B': + BUF_PUSH (notwordbound); + break; + + case '`': + BUF_PUSH (begbuf); + break; + + case '\'': + BUF_PUSH (endbuf); + break; + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + return REG_ESUBREG; + + /* Can't back reference to a subexpression if inside of it. */ + if (group_in_compile_stack (compile_stack, c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2 (duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE (c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. */ + if (!pending_exact + + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + + /* We have only one byte following the exactn for the count. 
*/ + || *pending_exact == (1 << BYTEWIDTH) - 1 + + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' + || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' || p[1] == '?') + : (*p == '+' || *p == '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' + : (p[0] == '\\' && p[1] == '{')))) + { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2 (exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH (c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + return REG_EPAREN; + + free (compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + +#ifdef DEBUG + if (debug) + { + DEBUG_PRINT1 ("\nCompiled pattern: "); + print_compiled_pattern (bufp); + } +#endif /* DEBUG */ + + return REG_NOERROR; +} /* regex_compile */ + +/* Subroutines for `regex_compile'. */ + +/* Store OP at LOC followed by two-byte integer parameter ARG. */ + +static void +store_op1 (op, loc, arg) + re_opcode_t op; + unsigned char *loc; + int arg; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg); +} + + +/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */ + +static void +store_op2 (op, loc, arg1, arg2) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; +{ + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg1); + STORE_NUMBER (loc + 3, arg2); +} + + +/* Copy the bytes from LOC to END to open up three bytes of space at LOC + for OP followed by two-byte integer parameter ARG. 
*/
+
+static void
+insert_op1 (op, loc, arg, end)
+    re_opcode_t op;
+    unsigned char *loc;
+    int arg;
+    unsigned char *end;
+{
+  register unsigned char *pfrom = end;
+  register unsigned char *pto = end + 3;
+
+  /* Slide the bytes in [LOC, END) up by three.  The two regions overlap,
+     so copy from the top down.  Nothing here checks that END + 3 is
+     still inside the buffer; the caller must have reserved the room.  */
+  while (pfrom != loc)
+    *--pto = *--pfrom;
+
+  /* The three bytes opened up at LOC now take OP and ARG.  */
+  store_op1 (op, loc, arg);
+}
+
+
+/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2:
+   the bytes in [LOC, END) are moved up by five, and OP, ARG1 and ARG2
+   are stored in the gap at LOC.  As with `insert_op1', the caller must
+   already have made sure there are five spare bytes after END.  */
+
+static void
+insert_op2 (op, loc, arg1, arg2, end)
+    re_opcode_t op;
+    unsigned char *loc;
+    int arg1, arg2;
+    unsigned char *end;
+{
+  register unsigned char *pfrom = end;
+  register unsigned char *pto = end + 5;
+
+  /* Overlapping move, so go backwards.  */
+  while (pfrom != loc)
+    *--pto = *--pfrom;
+
+  store_op2 (op, loc, arg1, arg2);
+}
+
+
+/* P points to just after a ^ in PATTERN.  Return true if that ^ comes
+   after an alternative or a begin-subexpression.  We assume there is at
+   least one character before the ^, so that `P - 2' is a valid
+   position in PATTERN.
+
+   SYNTAX decides whether the open-group and alternation operators are
+   written bare (RE_NO_BK_PARENS, RE_NO_BK_VBAR) or with a leading
+   backslash.  */
+
+static boolean
+at_begline_loc_p (pattern, p, syntax)
+    const char *pattern, *p;
+    reg_syntax_t syntax;
+{
+  /* The character immediately before the ^.  */
+  const char *prev = p - 2;
+  /* Whether that character is itself preceded by a backslash.  Only
+     the one preceding byte is examined.  */
+  boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
+
+  return
+       /* After a subexpression?  */
+       (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
+       /* After an alternative?  */
+    || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
+}
+
+
+/* The dual of at_begline_loc_p.  This one is for $.  P points to just
+   after the $; return true if what follows is a close-group or an
+   alternation operator in the given SYNTAX.  We assume there is at
+   least one character after the $, i.e., `P < PEND'.
+
+   SYNTAX is declared `reg_syntax_t', the type the caller passes and
+   the type `at_begline_loc_p' uses, rather than plain `int'.  */
+
+static boolean
+at_endline_loc_p (p, pend, syntax)
+    const char *p, *pend;
+    reg_syntax_t syntax;
+{
+  /* The character immediately after the $.  */
+  const char *next = p;
+  boolean next_backslash = *next == '\\';
+  /* The character after that, or NULL if the pattern ends first.  */
+  const char *next_next = p + 1 < pend ? p + 1 : NULL;
+
+  return
+       /* Before a subexpression?  */
+       (syntax & RE_NO_BK_PARENS ? *next == ')'
+        : next_backslash && next_next && *next_next == ')')
+       /* Before an alternative?  */
+    || (syntax & RE_NO_BK_VBAR ? *next == '|'
+        : next_backslash && next_next && *next_next == '|');
+}
+
+
+/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
+   false if it's not.
*/
+
+static boolean
+group_in_compile_stack (compile_stack, regnum)
+    compile_stack_type compile_stack;
+    regnum_t regnum;
+{
+  int this_element;
+
+  /* Walk the occupied entries, newest first.  */
+  for (this_element = compile_stack.avail - 1;
+       this_element >= 0;
+       this_element--)
+    if (compile_stack.stack[this_element].regnum == regnum)
+      return true;
+
+  return false;
+}
+
+
+/* Read the ending character of a range (in a bracket expression) from the
+   uncompiled pattern *P_PTR (which ends at PEND).  We assume the
+   starting character is in `P[-2]'.  (`P[-1]' is the character `-'.)
+   Then we set the translation of all bits between the starting and
+   ending characters (inclusive) in the compiled pattern B.
+
+   Both endpoints are taken from the pattern untranslated; TRANSLATE is
+   applied to each member of the range as its bit is set.  Translating
+   only one endpoint would make the extent of the range depend on the
+   translate table.
+
+   On return *P_PTR has been advanced past the ending character.
+
+   Return an error code: REG_ERANGE if the pattern ends before the
+   ending character, or if the range is empty and SYNTAX has
+   RE_NO_EMPTY_RANGES; REG_NOERROR otherwise.
+
+   We use these short variable names so we can use the same macros as
+   `regex_compile' itself.  */
+
+static reg_errcode_t
+compile_range (p_ptr, pend, translate, syntax, b)
+    const char **p_ptr, *pend;
+    char *translate;
+    reg_syntax_t syntax;
+    unsigned char *b;
+{
+  unsigned this_char;
+
+  const char *p = *p_ptr;
+
+  /* Even though the pattern is a signed `char *', we need to fetch into
+     `unsigned char's.  Reason: if the high bit of the pattern character
+     is set, the range endpoints will be negative if we fetch into a
+     signed `char *'.  */
+  unsigned char range_end;
+  unsigned char range_start = p[-2];
+
+  if (p == pend)
+    return REG_ERANGE;
+
+  /* Fetch the end point without translating it, to match the raw
+     `range_start' above.  PATFETCH_RAW is the fetch `regex_compile'
+     uses when a character must not be translated.  */
+  PATFETCH_RAW (range_end);
+
+  /* Have to increment the pointer into the pattern string, so the
+     caller isn't still at the ending character.  */
+  (*p_ptr)++;
+
+  /* If the start is after the end, the range is empty.  */
+  if (range_start > range_end)
+    return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
+
+  /* Here we see why `this_char' has to be larger than an `unsigned
+     char' -- the range is inclusive, so if `range_end' == 0xff
+     (assuming 8-bit characters), we would otherwise go into an infinite
+     loop, since all characters <= 0xff.  */
+  for (this_char = range_start; this_char <= range_end; this_char++)
+    {
+      SET_LIST_BIT (TRANSLATE (this_char));
+    }
+
+  return REG_NOERROR;
+}
+
+/* Failure stack declarations and macros; both re_compile_fastmap and
+   re_match_2 use a failure stack.  These have to be macros because of
+   REGEX_ALLOCATE.  */
+
+
+/* Number of failure points for which to initially allocate space
+   when matching.  If this number is exceeded, we allocate more
+   space, so it is not a hard limit.  */
+#ifndef INIT_FAILURE_ALLOC
+#define INIT_FAILURE_ALLOC 5
+#endif
+
+/* Roughly the maximum number of failure points on the stack.  Would be
+   exactly that if always used MAX_FAILURE_ITEMS each time we failed.
+   This is a variable only so users of regex can assign to it; we never
+   change it ourselves.  */
+int re_max_failures = 2000;
+
+/* Every slot of the failure stack has this type; other kinds of item
+   are cast to it when pushed and cast back when popped.  */
+typedef const unsigned char *fail_stack_elt_t;
+
+typedef struct
+{
+  fail_stack_elt_t *stack;
+  unsigned size;                /* Number of slots allocated.  */
+  unsigned avail;               /* Offset of next open position.  */
+} fail_stack_type;
+
+/* All but FAIL_STACK_PTR_EMPTY assume a variable named `fail_stack';
+   that one assumes a pointer named `fail_stack_ptr'.
+   NOTE(review): FAIL_STACK_TOP indexes `avail', i.e. the next open
+   slot rather than the item pushed last -- confirm against its users.  */
+#define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
+#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
+#define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
+#define FAIL_STACK_TOP()       (fail_stack.stack[fail_stack.avail])
+
+
+/* Initialize `fail_stack'.  Do `return -2' if the alloc fails.  */
+
+#define INIT_FAIL_STACK() \
+  do { \
+    fail_stack.stack = (fail_stack_elt_t *) \
+      REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
+ \
+    if (fail_stack.stack == NULL) \
+      return -2; \
+ \
+    fail_stack.size = INIT_FAILURE_ALLOC; \
+    fail_stack.avail = 0; \
+  } while (0)
+
+
+/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
+
+   Return 1 if succeeds, and 0 if either ran out of memory
+   allocating space for it or it was already too large.  When the
+   reallocation fails, `(fail_stack).stack' has already been
+   overwritten with NULL.
+
+   REGEX_REALLOCATE requires `destination' be declared, and
+   MAX_FAILURE_ITEMS requires `num_regs'.  */
+
+#define DOUBLE_FAIL_STACK(fail_stack) \
+  ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
+   ? 0 \
+   : ((fail_stack).stack = (fail_stack_elt_t *) \
+        REGEX_REALLOCATE ((fail_stack).stack, \
+          (fail_stack).size * sizeof (fail_stack_elt_t), \
+          ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
+ \
+      (fail_stack).stack == NULL \
+      ? 0 \
+      : ((fail_stack).size <<= 1, \
+         1)))
+
+
+/* Push PATTERN_OP on FAIL_STACK.
+
+   Return 1 if was able to do so and 0 if ran out of memory allocating
+   space to do so.  The fullness test goes through FAIL_STACK_FULL and
+   so always looks at the variable named `fail_stack'.  */
+#define PUSH_PATTERN_OP(pattern_op, fail_stack) \
+  ((FAIL_STACK_FULL () \
+    && !DOUBLE_FAIL_STACK (fail_stack)) \
+    ? 0 \
+    : ((fail_stack).stack[(fail_stack).avail++] = (pattern_op), \
+       1))
+
+/* This pushes an item onto the failure stack.  ITEM is cast to
+   `fail_stack_elt_t', so it must be a value that survives that cast.
+   Does not check for room.  Assumes the variable `fail_stack'.
+   Probably should only be called from within `PUSH_FAILURE_POINT'.  */
+#define PUSH_FAILURE_ITEM(item) \
+  (fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) (item))
+
+/* The complement operation.  Assumes `fail_stack' is nonempty.  */
+#define POP_FAILURE_ITEM() (fail_stack.stack[--fail_stack.avail])
+
+/* Used to omit pushing failure point id's when we're not debugging.  */
+#ifdef DEBUG
+#define DEBUG_PUSH PUSH_FAILURE_ITEM
+#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
+#else
+#define DEBUG_PUSH(item)
+#define DEBUG_POP(item_addr)
+#endif
+
+
+/* Push the information about the state we will need
+   if we ever fail back to it.
+
+   Pushed, in order: for each register from lowest_active_reg through
+   highest_active_reg its start, its end and its info word; then
+   lowest_active_reg, highest_active_reg, PATTERN_PLACE, STRING_PLACE
+   and, when debugging, the failure id.
+
+   Requires variables fail_stack, regstart, regend, reg_info,
+   lowest_active_reg, highest_active_reg and num_regs be declared.
+   DOUBLE_FAIL_STACK requires `destination' be declared.
+
+   Does `return FAILURE_CODE' if runs out of memory.  */
+
+#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
+  do { \
+    char *destination; \
+    /* Must be int, so when we don't save any registers, the arithmetic \
+       of 0 + -1 isn't done as unsigned.  */ \
+    int this_reg; \
+ \
+    DEBUG_STATEMENT (failure_id++); \
+    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
+    DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
+    DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
+ \
+    DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
+    DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
+ \
+    /* Ensure we have enough space allocated for what we will push.  */ \
+    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
+      { \
+        if (!DOUBLE_FAIL_STACK (fail_stack)) \
+          return failure_code; \
+ \
+        DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
+                      (fail_stack).size); \
+        DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
+      } \
+ \
+    /* Push the info, starting with the registers.  */ \
+    DEBUG_PRINT1 ("\n"); \
+ \
+    for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
+         this_reg++) \
+      { \
+        DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
+        DEBUG_STATEMENT (num_regs_pushed++); \
+ \
+        DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
+        PUSH_FAILURE_ITEM (regstart[this_reg]); \
+ \
+        DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
+        PUSH_FAILURE_ITEM (regend[this_reg]); \
+ \
+        DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
+        DEBUG_PRINT2 (" match_null=%d", \
+                      REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
+        DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
+        DEBUG_PRINT2 (" matched_something=%d", \
+                      MATCHED_SOMETHING (reg_info[this_reg])); \
+        DEBUG_PRINT2 (" ever_matched=%d", \
+                      EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
+        DEBUG_PRINT1 ("\n"); \
+        PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
+      } \
+ \
+    DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
+    PUSH_FAILURE_ITEM (lowest_active_reg); \
+ \
+    DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
+    PUSH_FAILURE_ITEM (highest_active_reg); \
+ \
+    DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
+    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
+    PUSH_FAILURE_ITEM (pattern_place); \
+ \
+    DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
+    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
+                               size2); \
+    DEBUG_PRINT1 ("'\n"); \
+    PUSH_FAILURE_ITEM (string_place); \
+ \
+    DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
+    DEBUG_PUSH (failure_id); \
+  } while (0)
+
+/* This is the number of items that are pushed and popped on the stack
+   for each register.  */
+#define NUM_REG_ITEMS 3
+
+/* Individual items aside from the registers.  */
+#ifdef DEBUG
+#define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
+#else
+#define NUM_NONREG_ITEMS 4
+#endif
+
+/* We push at most this many items on the stack.  */
+#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
+
+/* We actually push this many items.  */
+#define NUM_FAILURE_ITEMS \
+  ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
+    + NUM_NONREG_ITEMS)
+
+/* How many items can still be added to the stack without overflowing it.  */
+#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
+
+
+/* Pops what PUSH_FAILURE_POINT pushes, in the reverse order.
+
+   We restore into the parameters, all of which should be lvalues:
+     STR -- the saved data position.
+     PAT -- the saved pattern position.
+     LOW_REG, HIGH_REG -- the lowest and highest active registers.
+     REGSTART, REGEND -- arrays of string positions.
+     REG_INFO -- array of information about each subexpression.
+
+   Also assumes the variables `fail_stack' and (if debugging), `bufp',
+   `pend', `string1', `size1', `string2', and `size2'.  */
+
+#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
+{ \
+  DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
+  int this_reg; \
+  const unsigned char *string_temp; \
+ \
+  assert (!FAIL_STACK_EMPTY ()); \
+ \
+  /* Remove failure points and point to how many regs pushed.  */ \
+  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
+  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
+  DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
+ \
+  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
+ \
+  DEBUG_POP (&failure_id); \
+  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
+ \
+  /* If the saved string location is NULL, it came from an \
+     on_failure_keep_string_jump opcode, and we want to throw away the \
+     saved NULL, thus retaining our current position in the string.  */ \
+  string_temp = POP_FAILURE_ITEM (); \
+  if (string_temp != NULL) \
+    str = (const char *) string_temp; \
+ \
+  DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
+  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
+  DEBUG_PRINT1 ("'\n"); \
+ \
+  pat = (unsigned char *) POP_FAILURE_ITEM (); \
+  DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
+  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
+ \
+  /* Restore register info.  */ \
+  high_reg = (unsigned) POP_FAILURE_ITEM (); \
+  DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
+ \
+  low_reg = (unsigned) POP_FAILURE_ITEM (); \
+  DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
+ \
+  for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
+    { \
+      DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
+ \
+      reg_info[this_reg].word = POP_FAILURE_ITEM (); \
+      DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
+ \
+      regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
+      DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
+ \
+      regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
+      DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
+    } \
+} /* POP_FAILURE_POINT */
+
+/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
+   BUFP.  A fastmap records which of the (1 << BYTEWIDTH) possible
+   characters can start a string that matches the pattern.  This fastmap
+   is used by re_search to skip quickly over impossible starting points.
+ + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. + + Returns 0 if we succeed, -2 if an internal error. */ + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ + int j, k; + fail_stack_type fail_stack; +#ifndef REGEX_MALLOC + char *destination; +#endif + /* We don't push any register information onto the failure stack. */ + unsigned num_regs = 0; + + register char *fastmap = bufp->fastmap; + unsigned char *pattern = bufp->buffer; + unsigned long size = bufp->used; + const unsigned char *p = pattern; + register unsigned char *pend = pattern + size; + + /* Assume that each path through the pattern can be null until + proven otherwise. We set this false at the bottom of switch + statement, to which we get only if a particular path doesn't + match the empty string. */ + boolean path_can_be_null = true; + + /* We aren't doing a `succeed_n' to begin with. */ + boolean succeed_n_p = false; + + assert (fastmap != NULL && p != NULL); + + INIT_FAIL_STACK (); + bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + bufp->can_be_null = 0; + + while (p != pend || !FAIL_STACK_EMPTY ()) + { + if (p == pend) + { + bufp->can_be_null |= path_can_be_null; + + /* Reset for next path. */ + path_can_be_null = true; + + p = fail_stack.stack[--fail_stack.avail]; + } + + /* We should never be about to go beyond the end of the pattern. */ + assert (p < pend); + +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + + /* I guess the idea here is to simply not bother with a fastmap + if a backreference is used, since it's too hard to figure out + the fastmap for the corresponding group. Setting + `can_be_null' stops `re_search_2' from using the fastmap, so + that is all we do. 
*/ + case duplicate: + bufp->can_be_null = 1; + return 0; + + + /* Following are the cases which match a character. These end + with `break'. */ + + case exactn: + fastmap[p[1]] = 1; + break; + + + case charset: + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + break; + + + case charset_not: + /* Chars beyond end of map must be allowed. */ + for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + break; + + + case wordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == Sword) + fastmap[j] = 1; + break; + + + case notwordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != Sword) + fastmap[j] = 1; + break; + + + case anychar: + /* `.' matches anything ... */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + /* ... except perhaps newline. */ + if (!(bufp->syntax & RE_DOT_NEWLINE)) + fastmap['\n'] = 0; + + /* Return if we have already set `can_be_null'; if we have, + then the fastmap is irrelevant. Something's wrong here. */ + else if (bufp->can_be_null) + return 0; + + /* Otherwise, have to check alternative paths. */ + break; + + +#ifdef emacs + case syntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + case notsyntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + /* All cases after this match the empty string. These end with + `continue'. 
*/ + + + case before_dot: + case at_dot: + case after_dot: + continue; +#endif /* not emacs */ + + + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case push_dummy_failure: + continue; + + + case jump_n: + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case jump_past_alt: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + if (j > 0) + continue; + + /* Jump backward implies we just went through the body of a + loop and matched nothing. Opcode jumped to should be + `on_failure_jump' or `succeed_n'. Just treat it like an + ordinary jump. For a * loop, it has pushed its failure + point already; if so, discard that as redundant. */ + if ((re_opcode_t) *p != on_failure_jump + && (re_opcode_t) *p != succeed_n) + continue; + + p++; + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + + /* If what's on the stack is where we are now, pop it. */ + if (!FAIL_STACK_EMPTY () + && fail_stack.stack[fail_stack.avail - 1] == p) + fail_stack.avail--; + + continue; + + + case on_failure_jump: + case on_failure_keep_string_jump: + handle_on_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + + /* For some patterns, e.g., `(a?)?', `p+j' here points to the + end of the pattern. We don't want to push such a point, + since when we restore it above, entering the switch will + increment `p' past the end of the pattern. We don't need + to push such a point since we obviously won't find any more + fastmap entries beyond `pend'. Such a pattern can match + the null string, though. */ + if (p + j < pend) + { + if (!PUSH_PATTERN_OP (p + j, fail_stack)) + return -2; + } + else + bufp->can_be_null = 1; + + if (succeed_n_p) + { + EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ + succeed_n_p = false; + } + + continue; + + + case succeed_n: + /* Get to the number of times to succeed. */ + p += 2; + + /* Increment p past the n for when k != 0. 
*/ + EXTRACT_NUMBER_AND_INCR (k, p); + if (k == 0) + { + p -= 4; + succeed_n_p = true; /* Spaghetti code alert. */ + goto handle_on_failure_jump; + } + continue; + + + case set_number_at: + p += 4; + continue; + + + case start_memory: + case stop_memory: + p += 2; + continue; + + + default: + abort (); /* We have listed all the cases. */ + } /* switch *p++ */ + + /* Getting here means we have found the possible starting + characters for one path of the pattern -- and that the empty + string does not match. We need not follow this path further. + Instead, look at the next alternative (remembered on the + stack), or quit if no more. The test at the top of the loop + does these things. */ + path_can_be_null = false; + p = pend; + } /* while p */ + + /* Set `can_be_null' for the last path (also the first path, if the + pattern is empty). */ + bufp->can_be_null |= path_can_be_null; + return 0; +} /* re_compile_fastmap */ + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + +void +re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; +{ + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = (regoff_t) 0; + } +} + +/* Searching routines. 
*/ + +/* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. */ + +int +re_search (bufp, string, size, startpos, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, startpos, range; + struct re_registers *regs; +{ + return re_search_2 (bufp, NULL, 0, string, size, startpos, range, + regs, size); +} + + +/* Using the compiled pattern in BUFP->buffer, first tries to match the + virtual concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match. RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error (such as failure + stack overflow). */ + +int +re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int startpos; + int range; + struct re_registers *regs; + int stop; +{ + int val; + register char *fastmap = bufp->fastmap; + register char *translate = bufp->translate; + int total_size = size1 + size2; + int endpos = startpos + range; + + /* Check for out-of-range STARTPOS. */ + if (startpos < 0 || startpos > total_size) + return -1; + + /* Fix up RANGE if it might eventually take us outside + the virtual concatenation of STRING1 and STRING2. 
*/ + if (endpos < -1) + range = -1 - startpos; + else if (endpos > total_size) + range = total_size - startpos; + + /* Update the fastmap now if not correct already. */ + if (fastmap && !bufp->fastmap_accurate) + if (re_compile_fastmap (bufp) == -2) + return -2; + + /* If the search isn't to be a backwards one, don't waste time in a + long search for a pattern that says it is anchored. */ + if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf + && range > 0) + { + if (startpos > 0) + return -1; + else + range = 1; + } + + for (;;) + { + /* If a fastmap is supplied, skip quickly over characters that + cannot be the start of a match. If the pattern can match the + null string, however, we don't need to skip characters; we want + the first null string. */ + if (fastmap && startpos < total_size && !bufp->can_be_null) + { + if (range > 0) /* Searching forwards. */ + { + register const char *d; + register int lim = 0; + int irange = range; + + if (startpos < size1 && startpos + range >= size1) + lim = range - (size1 - startpos); + + d = (startpos >= size1 ? string2 - size1 : string1) + startpos; + + /* Written out as an if-else to avoid testing `translate' + inside the loop. */ + if (translate) + while (range > lim + && !fastmap[(unsigned char) translate[*d++]]) + range--; + else + while (range > lim && !fastmap[(unsigned char) *d++]) + range--; + + startpos += irange - range; + } + else /* Searching backwards. */ + { + register char c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); + + if (!fastmap[TRANSLATE (c)]) + goto advance; + } + } + + /* If can't match the null string, and that's all we have left, fail. 
*/ + if (range >= 0 && startpos == total_size && fastmap + && !bufp->can_be_null) + return -1; + + val = re_match_2 (bufp, string1, size1, string2, size2, + startpos, regs, stop); + if (val >= 0) + return startpos; + + if (val == -2) + return -2; + + advance: + if (!range) + break; + else if (range > 0) + { + range--; + startpos++; + } + else + { + range++; + startpos--; + } + } + return -1; +} /* re_search_2 */ + +/* Declarations and macros for re_match_2. */ + +static int bcmp_translate (); +static boolean alt_match_null_string_p (), + common_op_match_null_string_p (), + group_match_null_string_p (); + +/* Structure for per-register (a.k.a. per-group) information. + This must not be longer than one word, because we push this value + onto the failure stack. Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. + + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ +typedef union +{ + fail_stack_elt_t word; + struct + { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ +#define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p : 2; + unsigned is_active : 1; + unsigned matched_something : 1; + unsigned ever_matched_something : 1; + } bits; +} register_info_type; + +#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) +#define IS_ACTIVE(R) ((R).bits.is_active) +#define MATCHED_SOMETHING(R) ((R).bits.matched_something) +#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + +/* Call this when have matched something; it sets `matched' flags for the + registers corresponding to the group of which we currently are inside. 
 +   Also records whether this group ever matched something.  We only care
+   about this information at `stop_memory', and then only about the
+   previous time through the loop (if the group is starred or whatever).
+   So it is ok to clear all the nonactive registers here.
+
+   As written, the macro stores 1 in both MATCHED_SOMETHING and
+   EVER_MATCHED_SOMETHING for every register numbered from
+   `lowest_active_reg' through `highest_active_reg' inclusive; it does
+   not test IS_ACTIVE for the registers inside that range.  It uses the
+   caller's `reg_info', `lowest_active_reg' and `highest_active_reg'.  */
+#define SET_REGS_MATCHED()						\
+  do									\
+    {									\
+      unsigned r;							\
+      for (r = lowest_active_reg; r <= highest_active_reg; r++)		\
+        {								\
+          MATCHED_SOMETHING (reg_info[r])				\
+            = EVER_MATCHED_SOMETHING (reg_info[r])			\
+            = 1;							\
+        }								\
+    }									\
+  while (0)
+
+
+/* This converts PTR, a pointer into one of the search strings `string1'
+   and `string2' into an offset from the beginning of that string.
+   For a pointer that is not in the first string, `size1' is added, so
+   the result is an offset into the virtual concatenation of the two
+   strings.  Uses the caller's `string1', `string2' and `size1'.  */
+#define POINTER_TO_OFFSET(ptr)						\
+  (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
+
+/* Registers are set to a sentinel when they haven't yet matched.
+   REG_UNSET tests a register pointer against that sentinel.  */
+#define REG_UNSET_VALUE ((char *) -1)
+#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
+
+
+/* Macros for dealing with the split strings in re_match_2.  */
+
+#define MATCHING_IN_FIRST_STRING  (dend == end_match_1)
+
+/* Call before fetching a character with *d.  This switches over to
+   string2 if necessary.
+
+   This expands to a bare `while' statement (it is not wrapped in
+   do ... while (0)) that reads and writes the caller's `d' and `dend',
+   reads `string2' and `end_match_2', and jumps to the caller's `fail'
+   label when `d' has reached `end_match_2'.  */
+#define PREFETCH()							\
+  while (d == dend)						    	\
+    {									\
+      /* End of string2 => fail.  */					\
+      if (dend == end_match_2) 						\
+        goto fail;							\
+      /* End of string1 => advance to string2.  */ 			\
+      d = string2;						        \
+      dend = end_match_2;						\
+    }
+
+
+/* Test if at very beginning or at very end of the virtual concatenation
+   of `string1' and `string2'.  If only one string, it's `string2'.
+   Note that AT_STRINGS_BEG is also true whenever `size2' is zero,
+   wherever `d' points.  */
+#define AT_STRINGS_BEG() (d == (size1 ? string1 : string2) || !size2)
+#define AT_STRINGS_END() (d == end2)	
+
+
+/* Test if D points to a character which is word-constituent.  We have
+   two special cases to check for: if past the end of string1, look at
+   the first character in string2; and if before the beginning of
+   string2, look at the last character in string1.
+   
+   Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().
   The argument D is evaluated more than once.  */
+#define LETTER_P(d)							\
+  (SYNTAX ((d) == end1 ? *string2					\
+  : (d) == string2 - 1 ? *(end1 - 1) : *(d)) == Sword)
+
+/* Test if the character before D and the one at D differ with respect
+   to being word-constituent.  Also true, without looking at any
+   character, at the very beginning or the very end of the strings.  */
+#define AT_WORD_BOUNDARY(d)						\
+  (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d - 1) != LETTER_P (d))
+
+
+/* Free everything we malloc.
+
+   FREE_VAR expands to an `if' statement followed by a separate
+   assignment, so it is only safe as a complete statement inside
+   braces, which is how FREE_VARIABLES uses it.  Each variable is set
+   to NULL after being freed.  When REGEX_MALLOC is not defined,
+   FREE_VARIABLES is just a call to alloca (0).  */
+#ifdef REGEX_MALLOC
+#define FREE_VAR(var) if (var) free (var); var = NULL
+#define FREE_VARIABLES()						\
+  do {									\
+    FREE_VAR (fail_stack.stack);					\
+    FREE_VAR (regstart);						\
+    FREE_VAR (regend);							\
+    FREE_VAR (old_regstart);						\
+    FREE_VAR (old_regend);						\
+    FREE_VAR (best_regstart);						\
+    FREE_VAR (best_regend);						\
+    FREE_VAR (reg_info);						\
+    FREE_VAR (reg_dummy);						\
+    FREE_VAR (reg_info_dummy);						\
+  } while (0)
+#else /* not REGEX_MALLOC */
+/* Some MIPS systems (at least) want this to free alloca'd storage.  */
+#define FREE_VARIABLES() alloca (0)
+#endif /* not REGEX_MALLOC */
+
+
+/* These values must meet several constraints.  They must not be valid
+   register values; since we have a limit of 255 registers (because
+   we use only one byte in the pattern for the register number), we can
+   use numbers larger than 255.  They must differ by 1, because of
+   NUM_FAILURE_ITEMS above.  And the value for the lowest register must
+   be larger than the value for the highest register, so we do not try
+   to actually save any registers when none are active.  */
+#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
+#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
+
+/* Matching routines.  */
+
+#ifndef emacs   /* Emacs never uses this.  */
+/* re_match is like re_match_2 except it takes only a single string.
   It hands STRING (of length SIZE) to re_match_2 as the second string,
+   with a null first string of length 0 and SIZE as the stop index, and
+   returns whatever re_match_2 returns.  */
+
+int
+re_match (bufp, string, size, pos, regs)
+     struct re_pattern_buffer *bufp;
+     const char *string;
+     int size, pos;
+     struct re_registers *regs;
+ {
+  return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); 
+}
+#endif /* not emacs */
+
+
+/* re_match_2 matches the compiled pattern in BUFP against the
+   (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
+   and SIZE2, respectively).  We start matching at POS, and stop
+   matching at STOP.
+   
+   If REGS is non-null and the `no_sub' field of BUFP is zero, we
+   store offsets for the substring each group matched in REGS.  See the
+   documentation for exactly how many groups we fill.
+
+   We return -1 if no match, -2 if an internal error (such as the
+   failure stack overflowing).  Otherwise, we return the length of the
+   matched substring.  */
+
+int
+re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
+     struct re_pattern_buffer *bufp;
+     const char *string1, *string2;
+     int size1, size2;
+     int pos;
+     struct re_registers *regs;
+     int stop;
+{
+  /* General temporaries.  */
+  int mcnt;
+  unsigned char *p1;
+
+  /* Just past the end of the corresponding string.  */
+  const char *end1, *end2;
+
+  /* Pointers into string1 and string2, just past the last characters in
+     each to consider matching.  */
+  const char *end_match_1, *end_match_2;
+
+  /* Where we are in the data, and the end of the current string.  */
+  const char *d, *dend;
+  
+  /* Where we are in the pattern, and the end of the pattern.  */
+  unsigned char *p = bufp->buffer;
+  register unsigned char *pend = p + bufp->used;
+
+  /* We use this to map every character in the string.  */
+  char *translate = bufp->translate;
+
+  /* Failure point stack.  Each place that can handle a failure further
+     down the line pushes a failure point on this stack.  It consists of
+     regstart, regend, and reg_info for all registers corresponding to
+     the subexpressions we're currently inside, plus the number of such
+     registers, and, finally, two char *'s.
The first char * is where + to resume scanning the pattern; the second one is where to resume + scanning the strings. If the latter is zero, the failure point is + a ``dummy''; if a failure happens and the failure point is a dummy, + it gets discarded and the next next one is tried. */ + fail_stack_type fail_stack; +#ifdef DEBUG + static unsigned failure_id = 0; +#endif + + /* We fill all the registers internally, independent of what we + return, for use in backreferences. The number here includes + an element for register zero. */ + unsigned num_regs = bufp->re_nsub + 1; + + /* The currently active registers. */ + unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG; + unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG; + + /* Information on the contents of registers. These are pointers into + the input strings; they record just what was matched (on this + attempt) by a subexpression part of the pattern, that is, the + regnum-th regstart pointer points to where in the pattern we began + matching and the regnum-th regend points to right after where we + stopped matching the regnum-th subexpression. (The zeroth register + keeps track of what the whole pattern matches.) */ + const char **regstart, **regend; + + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ + const char **old_regstart, **old_regend; + + /* The is_active field of reg_info helps us keep track of which (possibly + nested) subexpressions we are currently in. The matched_something + field of reg_info[reg_num] helps us tell whether or not we have + matched any of the pattern so far this time through the reg_num-th + subexpression. These two fields get reset each time through any + loop their register is in. 
*/ + register_info_type *reg_info; + + /* The following record the register info as found in the above + variables when we find a match better than any we've seen before. + This happens as we backtrack through the failure points, which in + turn happens only if we have not yet matched the entire string. */ + unsigned best_regs_set = false; + const char **best_regstart, **best_regend; + + /* Logically, this is `best_regend[0]'. But we don't want to have to + allocate space for that if we're not allocating space for anything + else (see below). Also, we never need info about register 0 for + any of the other register vectors, and it seems rather a kludge to + treat `best_regend' differently than the rest. So we keep track of + the end of the best match so far in a separate variable. We + initialize this to NULL so that when we backtrack the first time + and need to test it, it's not garbage. */ + const char *match_end = NULL; + + /* Used when we pop values we don't care about. */ + const char **reg_dummy; + register_info_type *reg_info_dummy; + +#ifdef DEBUG + /* Counts the total number of registers pushed. */ + unsigned num_regs_pushed = 0; +#endif + + DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); + + INIT_FAIL_STACK (); + + /* Do not bother to initialize all the register variables if there are + no groups in the pattern, as it takes a fair amount of time. If + there are groups, we include space for register 0 (the whole + pattern), even though we never use it, since it simplifies the + array indexing. We should fix this. 
*/ + if (bufp->re_nsub) + { + regstart = REGEX_TALLOC (num_regs, const char *); + regend = REGEX_TALLOC (num_regs, const char *); + old_regstart = REGEX_TALLOC (num_regs, const char *); + old_regend = REGEX_TALLOC (num_regs, const char *); + best_regstart = REGEX_TALLOC (num_regs, const char *); + best_regend = REGEX_TALLOC (num_regs, const char *); + reg_info = REGEX_TALLOC (num_regs, register_info_type); + reg_dummy = REGEX_TALLOC (num_regs, const char *); + reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); + + if (!(regstart && regend && old_regstart && old_regend && reg_info + && best_regstart && best_regend && reg_dummy && reg_info_dummy)) + { + FREE_VARIABLES (); + return -2; + } + } +#ifdef REGEX_MALLOC + else + { + /* We must initialize all our variables to NULL, so that + `FREE_VARIABLES' doesn't try to free them. Too bad this isn't + Lisp, so we could have a list of variables. As it is, */ + regstart = regend = old_regstart = old_regend = best_regstart + = best_regend = reg_dummy = NULL; + reg_info = reg_info_dummy = (register_info_type *) NULL; + } +#endif /* REGEX_MALLOC */ + + /* The starting position is bogus. */ + if (pos < 0 || pos > size1 + size2) + { + FREE_VARIABLES (); + return -1; + } + + /* Initialize subexpression text positions to -1 to mark ones that no + start_memory/stop_memory has been seen for. Also initialize the + register information struct. */ + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; + + REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE (reg_info[mcnt]) = 0; + MATCHED_SOMETHING (reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; + } + + /* We move `string1' into `string2' if the latter's empty -- but not if + `string1' is null. 
*/ + if (size2 == 0 && string1 != NULL) + { + string2 = string1; + size2 = size1; + string1 = 0; + size1 = 0; + } + end1 = string1 + size1; + end2 = string2 + size2; + + /* Compute where to stop matching, within the two strings. */ + if (stop <= size1) + { + end_match_1 = string1 + stop; + end_match_2 = string2; + } + else + { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + + /* `p' scans through the pattern as `d' scans through the data. + `dend' is the end of the input string that `d' points within. `d' + is advanced into the following input string whenever necessary, but + this happens before fetching; therefore, at the beginning of the + loop, `d' can be pointing at the end of a string, but it cannot + equal `string2'. */ + if (size1 > 0 && pos <= size1) + { + d = string1 + pos; + dend = end_match_1; + } + else + { + d = string2 + pos - size1; + dend = end_match_2; + } + + DEBUG_PRINT1 ("The compiled pattern is: "); + DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); + DEBUG_PRINT1 ("The string to match is: `"); + DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); + DEBUG_PRINT1 ("'\n"); + + /* This loops over pattern commands. It exits by returning from the + function if the match is complete, or it drops through if the match + fails at this starting point in the input data. */ + for (;;) + { + DEBUG_PRINT2 ("\n0x%x: ", p); + + if (p == pend) + { /* End of pattern means we might have succeeded. */ + DEBUG_PRINT1 ("End of pattern: "); + /* If not end of string, try backtracking. Otherwise done. */ + if (d != end_match_2) + { + DEBUG_PRINT1 ("backtracking.\n"); + + if (!FAIL_STACK_EMPTY ()) + { /* More failure points to try. */ + boolean same_str_p = (FIRST_STRING_P (match_end) + == MATCHING_IN_FIRST_STRING); + + /* If exceeds best match so far, save it. 
*/ + if (!best_regs_set + || (same_str_p && d > match_end) + || (!same_str_p && !MATCHING_IN_FIRST_STRING)) + { + best_regs_set = true; + match_end = d; + + DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + best_regstart[mcnt] = regstart[mcnt]; + best_regend[mcnt] = regend[mcnt]; + } + } + goto fail; + } + + /* If no failure points, don't restore garbage. */ + else if (best_regs_set) + { + restore_best_regs: + /* Restore best match. It may happen that `dend == + end_match_1' while the restored d is in string2. + For example, the pattern `x.*y.*z' against the + strings `x-' and `y-z-', if the two strings are + not consecutive in memory. */ + d = match_end; + dend = ((d >= string1 && d <= end1) + ? end_match_1 : end_match_2); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = best_regstart[mcnt]; + regend[mcnt] = best_regend[mcnt]; + } + } + } /* d != end_match_2 */ + + DEBUG_PRINT1 ("\nAccepting match.\n"); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) + { + /* Have the register data arrays been allocated? */ + if (bufp->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. */ + regs->num_regs = MAX (RE_NREGS, num_regs + 1); + regs->start = TALLOC (regs->num_regs, regoff_t); + regs->end = TALLOC (regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + bufp->regs_allocated = REGS_REALLOCATE; + } + else if (bufp->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. 
*/ + if (regs->num_regs < num_regs + 1) + { + regs->num_regs = num_regs + 1; + RETALLOC (regs->start, regs->num_regs, regoff_t); + RETALLOC (regs->end, regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + } + } + else + assert (bufp->regs_allocated == REGS_FIXED); + + /* Convert the pointer data in `regstart' and `regend' to + indices. Register zero has to be set differently, + since we haven't kept track of any info for it. */ + if (regs->num_regs > 0) + { + regs->start[0] = pos; + regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 + : d - string2 + size1); + } + + /* Go through the first `min (num_regs, regs->num_regs)' + registers, since that is all we initialized. */ + for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + { + if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) + regs->start[mcnt] = regs->end[mcnt] = -1; + else + { + regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); + regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); + } + } + + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. If + we (re)allocated the registers, this is the case, + because we always allocate enough to have at least one + -1 at the end. */ + for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + } /* regs && !bufp->no_sub */ + + FREE_VARIABLES (); + DEBUG_PRINT2 ("%d registers pushed.\n", num_regs_pushed); + + mcnt = d - pos - (MATCHING_IN_FIRST_STRING + ? string1 + : string2 - size1); + + DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); + + return mcnt; + } + + /* Otherwise match next pattern command. */ +#ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) +#else + switch ((re_opcode_t) *p++) +#endif + { + /* Ignore these. Used to ignore the n of succeed_n's which + currently have n == 0. 
*/ + case no_op: + DEBUG_PRINT1 ("EXECUTING no_op.\n"); + break; + + + /* Match the next n pattern characters exactly. The following + byte in the pattern defines n, and the n bytes after that + are the characters to match. */ + case exactn: + mcnt = *p++; + DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); + + /* This is written out as an if-else so we don't waste time + testing `translate' inside the loop. */ + if (translate) + { + do + { + PREFETCH (); + if (translate[(unsigned char) *d++] != (char) *p++) + goto fail; + } + while (--mcnt); + } + else + { + do + { + PREFETCH (); + if (*d++ != (char) *p++) goto fail; + } + while (--mcnt); + } + SET_REGS_MATCHED (); + break; + + + /* Match any character except possibly a newline or a null. */ + case anychar: + DEBUG_PRINT1 ("EXECUTING anychar.\n"); + + PREFETCH (); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') + || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d++; + break; + + + case charset: + case charset_not: + { + register unsigned char c; + boolean not = (re_opcode_t) *(p - 1) == charset_not; + + DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); + + PREFETCH (); + c = TRANSLATE (*d); /* The character to match. */ + + /* Cast to `unsigned' instead of `unsigned char' in case the + bit list is a full 32 bytes long. */ + if (c < (unsigned) (*p * BYTEWIDTH) + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + p += 1 + *p; + + if (!not) goto fail; + + SET_REGS_MATCHED (); + d++; + break; + } + + + /* The beginning of a group is represented by start_memory. + The arguments are the register number in the next byte, and the + number of groups inner to this one in the next. The text + matched within the group is recorded (in the internal + registers data structure) under the register number. 
*/ + case start_memory: + DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); + + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[*p]) + = group_match_null_string_p (&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regstart[*p]) ? d : regstart[*p] + : regstart[*p]; + DEBUG_PRINT2 (" old_regstart: %d\n", + POINTER_TO_OFFSET (old_regstart[*p])); + + regstart[*p] = d; + DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); + + IS_ACTIVE (reg_info[*p]) = 1; + MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* This is the new highest active register. */ + highest_active_reg = *p; + + /* If nothing was active before, this is the new lowest active + register. */ + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *p; + + /* Move past the register number and inner group count. */ + p += 2; + break; + + + /* The stop_memory opcode represents the end of a group. Its + arguments are the same as start_memory's: the register + number, and the number of inner groups. */ + case stop_memory: + DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regend[*p]) ? 
d : regend[*p] + : regend[*p]; + DEBUG_PRINT2 (" old_regend: %d\n", + POINTER_TO_OFFSET (old_regend[*p])); + + regend[*p] = d; + DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); + + /* This register isn't active anymore. */ + IS_ACTIVE (reg_info[*p]) = 0; + + /* If this was the only register active, nothing is active + anymore. */ + if (lowest_active_reg == highest_active_reg) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + { /* We must scan for the new highest active register, since + it isn't necessarily one less than now: consider + (a(b)c(d(e)f)g). When group 3 ends, after the f), the + new highest active register is 1. */ + unsigned char r = *p - 1; + while (r > 0 && !IS_ACTIVE (reg_info[r])) + r--; + + /* If we end up at register zero, that means that we saved + the registers as the result of an `on_failure_jump', not + a `start_memory', and we jumped to past the innermost + `stop_memory'. For example, in ((.)*) we save + registers 1 and 2 as a result of the *, but when we pop + back to the second ), we are at the stop_memory 1. + Thus, nothing is active. */ + if (r == 0) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + highest_active_reg = r; + } + + /* If just failed to match something this time around with a + group that's operated on by a repetition operator, try to + force exit from the ``loop,'' and restore the register + information for this group that we had before trying this + last match. 
*/ + if ((!MATCHED_SOMETHING (reg_info[*p]) + || (re_opcode_t) p[-3] == start_memory) + && (p + 2) < pend) + { + boolean is_a_jump_n = false; + + p1 = p + 2; + mcnt = 0; + switch ((re_opcode_t) *p1++) + { + case jump_n: + is_a_jump_n = true; + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (is_a_jump_n) + p1 += 2; + break; + + default: + /* do nothing */ ; + } + p1 += mcnt; + + /* If the next operation is a jump backwards in the pattern + to an on_failure_jump right before the start_memory + corresponding to this stop_memory, exit from the loop + by forcing a failure after pushing on the stack the + on_failure_jump's jump in the pattern, and d. */ + if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump + && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. + + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING (reg_info[*p])) + { + unsigned r; + + EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; r < *p + *(p + 1); r++) + { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if ((int) old_regend[r] >= (int) regstart[r]) + regend[r] = old_regend[r]; + } + } + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + PUSH_FAILURE_POINT (p1 + mcnt, d, -2); + + goto fail; + } + } + + /* Move past the register number and the inner group count. */ + p += 2; + break; + + + /* \ has been turned into a `duplicate' command which is + followed by the numeric value of as the register number. 
*/ + case duplicate: + { + register const char *d2, *dend2; + int regno = *p++; /* Get which register to match against. */ + DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); + + /* Can't back reference a group which we've never matched. */ + if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) + goto fail; + + /* Where in input to try to start matching. */ + d2 = regstart[regno]; + + /* Where to stop matching; if both the place to start and + the place to stop matching are in the same string, then + set to the place to stop, otherwise, for now have to use + the end of the first string. */ + + dend2 = ((FIRST_STRING_P (regstart[regno]) + == FIRST_STRING_P (regend[regno])) + ? regend[regno] : end_match_1); + for (;;) + { + /* If necessary, advance to next segment in register + contents. */ + while (d2 == dend2) + { + if (dend2 == end_match_2) break; + if (dend2 == regend[regno]) break; + + /* End of string1 => advance to string2. */ + d2 = string2; + dend2 = regend[regno]; + } + /* At end of register contents => success */ + if (d2 == dend2) break; + + /* If necessary, advance to next segment in data. */ + PREFETCH (); + + /* How many characters left in this segment to match. */ + mcnt = dend - d; + + /* Want how many consecutive characters we can match in + one shot, so, if necessary, adjust the count. */ + if (mcnt > dend2 - d2) + mcnt = dend2 - d2; + + /* Compare that many; failure if mismatch, else move + past them. */ + if (translate + ? bcmp_translate (d, d2, mcnt, translate) + : bcmp (d, d2, mcnt)) + goto fail; + d += mcnt, d2 += mcnt; + } + } + break; + + + /* begline matches the empty string at the beginning of the string + (unless `not_bol' is set in `bufp'), and, if + `newline_anchor' is set, after newlines. */ + case begline: + DEBUG_PRINT1 ("EXECUTING begline.\n"); + + if (AT_STRINGS_BEG ()) + { + if (!bufp->not_bol) break; + } + else if (d[-1] == '\n' && bufp->newline_anchor) + { + break; + } + /* In all other cases, we fail. 
*/ + goto fail; + + + /* endline is the dual of begline. */ + case endline: + DEBUG_PRINT1 ("EXECUTING endline.\n"); + + if (AT_STRINGS_END ()) + { + if (!bufp->not_eol) break; + } + + /* We have to ``prefetch'' the next character. */ + else if ((d == end1 ? *string2 : *d) == '\n' + && bufp->newline_anchor) + { + break; + } + goto fail; + + + /* Match at the very beginning of the data. */ + case begbuf: + DEBUG_PRINT1 ("EXECUTING begbuf.\n"); + if (AT_STRINGS_BEG ()) + break; + goto fail; + + + /* Match at the very end of the data. */ + case endbuf: + DEBUG_PRINT1 ("EXECUTING endbuf.\n"); + if (AT_STRINGS_END ()) + break; + goto fail; + + + /* on_failure_keep_string_jump is used to optimize `.*\n'. It + pushes NULL as the value for the string on the stack. Then + `pop_failure_point' will keep the current value for the + string, instead of restoring it. To see why, consider + matching `foo\nbar' against `.*\n'. The .* matches the foo; + then the . fails against the \n. But the next thing we want + to do is match the \n against the \n; if we restored the + string value, we would be back at the foo. + + Because this is used only in specific cases, we don't need to + check all the things that `on_failure_jump' does, to make + sure the right things get saved on the stack. Hence we don't + share its code. The only reason to push anything on the + stack at all is that otherwise we would have to change + `anychar's code to do something besides goto fail in this + case; that seems worse than this. */ + case on_failure_keep_string_jump: + DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); + + PUSH_FAILURE_POINT (p + mcnt, NULL, -2); + break; + + + /* Uses of on_failure_jump: + + Each alternative starts with an on_failure_jump that points + to the beginning of the next alternative. 
Each alternative + except the last ends with a jump that in effect jumps past + the rest of the alternatives. (They really jump to the + ending jump of the following alternative, because tensioning + these jumps is a hassle.) + + Repeats start with an on_failure_jump that points past both + the repetition text and either the following jump or + pop_failure_jump back to this on_failure_jump. */ + case on_failure_jump: + on_failure: + DEBUG_PRINT1 ("EXECUTING on_failure_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); + + /* If this on_failure_jump comes right before a group (i.e., + the original * applied to a group), save the information + for that group and all inner ones, so that if we fail back + to this point, the group's information will be correct. + For example, in \(a*\)*\1, we only need the preceding group, + and in \(\(a*\)b*\)\2, we need the inner group. */ + + /* We can't use `p' to check ahead because we push + a failure point to `p + mcnt' after we do this. */ + p1 = p; + + /* We need to skip no_op's before we look for the + start_memory in case this on_failure_jump is happening as + the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 + against aba. */ + while (p1 < pend && (re_opcode_t) *p1 == no_op) + p1++; + + if (p1 < pend && (re_opcode_t) *p1 == start_memory) + { + /* We have a new highest active register now. This will + get reset at the start_memory we are about to get to, + but we will have saved all the registers relevant to + this repetition op, as described above. */ + highest_active_reg = *(p1 + 1) + *(p1 + 2); + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *(p1 + 1); + } + + DEBUG_PRINT1 (":\n"); + PUSH_FAILURE_POINT (p + mcnt, d, -2); + break; + + + /* A smart repeat ends with a maybe_pop_jump. + We change it either to a pop_failure_jump or a jump. 
*/ + case maybe_pop_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); + { + register unsigned char *p2 = p; + + /* Compare the beginning of the repeat with what in the + pattern follows its end. If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. */ + while (p2 + 2 < pend + && ((re_opcode_t) *p2 == stop_memory + || (re_opcode_t) *p2 == start_memory)) + p2 += 3; /* Skip over args, too. */ + + /* If we're at the end of the pattern, we can change. */ + if (p2 == pend) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" End of pattern: change to `pop_failure_jump'.\n"); + } + + else if ((re_opcode_t) *p2 == exactn + || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) + { + register unsigned char c + = *p2 == (unsigned char) endline ? '\n' : p2[2]; + p1 = p + mcnt; + + /* p1[0] ... p1[2] are the `on_failure_jump' corresponding + to the `maybe_finalize_jump' of this case. Examine what + follows. */ + if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + p[-3] = (unsigned char) pop_failure_jump; + else if ((re_opcode_t) p1[3] == charset + || (re_opcode_t) p1[3] == charset_not) + { + int not = (re_opcode_t) p1[3] == charset_not; + + if (c < (unsigned char) (p1[4] * BYTEWIDTH) + && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + /* `not' is equal to 1 if c would match, which means + that we can't change to pop_failure_jump. 
*/ + if (!not) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" No match: change to `pop_failure_jump'.\n"); + + } + } + } + } + p -= 2; /* Point at relative address again. */ + if ((re_opcode_t) p[-1] != pop_failure_jump) + { + p[-1] = (unsigned char) jump; + goto unconditional_jump; + } + /* Note fall through. */ + + + /* The end of a simple repeat has a pop_failure_jump back to + its matching on_failure_jump, where the latter will push a + failure point. The pop_failure_jump takes off failure + points put on by this pop_failure_jump's matching + on_failure_jump; we got through the pattern to here from the + matching on_failure_jump, so didn't fail. */ + case pop_failure_jump: + { + /* We need to pass separate storage for the lowest and + highest registers, even though we don't care about the + actual values. Otherwise, we will restore only one + register from the stack, since lowest will == highest in + `pop_failure_point'. */ + unsigned dummy_low_reg, dummy_high_reg; + unsigned char *pdummy; + const char *sdummy; + + DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); + POP_FAILURE_POINT (sdummy, pdummy, + dummy_low_reg, dummy_high_reg, + reg_dummy, reg_dummy, reg_info_dummy); + } + /* Note fall through. */ + + + /* Unconditionally jump (without popping any failure points). */ + case jump: + unconditional_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ + DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); + p += mcnt; /* Do the jump. */ + DEBUG_PRINT2 ("(to 0x%x).\n", p); + break; + + + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); + goto unconditional_jump; + + + /* Normally, the on_failure_jump pushes a failure point, which + then gets popped at pop_failure_jump. 
We will end up at + pop_failure_jump, also, and with a pattern of, say, `a+', we + are skipping over the on_failure_jump, so we have to push + something meaningless for pop_failure_jump to pop. */ + case dummy_failure_jump: + DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); + /* It doesn't matter what we push for the string here. What + the code at `fail' tests is the value for the pattern. */ + PUSH_FAILURE_POINT (0, 0, -2); + goto unconditional_jump; + + + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a pop_failure_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); + /* See comments just above at `dummy_failure_jump' about the + two zeroes. */ + PUSH_FAILURE_POINT (0, 0, -2); + break; + + /* Have to succeed matching what follows at least n times. + After that, handle like `on_failure_jump'. */ + case succeed_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); + + assert (mcnt >= 0); + /* Originally, this is how many times we HAVE to succeed. */ + if (mcnt > 0) + { + mcnt--; + p += 2; + STORE_NUMBER_AND_INCR (p, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); + } + else if (mcnt == 0) + { + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); + p[2] = (unsigned char) no_op; + p[3] = (unsigned char) no_op; + goto on_failure; + } + break; + + case jump_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); + + /* Originally, this is how many times we CAN jump. */ + if (mcnt) + { + mcnt--; + STORE_NUMBER (p + 2, mcnt); + goto unconditional_jump; + } + /* If don't have to jump any more, skip over the rest of command. 
*/ + else + p += 4; + break; + + case set_number_at: + { + DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + p1 = p + mcnt; + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); + STORE_NUMBER (p1, mcnt); + break; + } + + case wordbound: + DEBUG_PRINT1 ("EXECUTING wordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + break; + goto fail; + + case notwordbound: + DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + goto fail; + break; + + case wordbeg: + DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); + if (LETTER_P (d) && (AT_STRINGS_BEG () || !LETTER_P (d - 1))) + break; + goto fail; + + case wordend: + DEBUG_PRINT1 ("EXECUTING wordend.\n"); + if (!AT_STRINGS_BEG () && LETTER_P (d - 1) + && (!LETTER_P (d) || AT_STRINGS_END ())) + break; + goto fail; + +#ifdef emacs +#ifdef emacs19 + case before_dot: + DEBUG_PRINT1 ("EXECUTING before_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) >= point) + goto fail; + break; + + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) != point) + goto fail; + break; + + case after_dot: + DEBUG_PRINT1 ("EXECUTING after_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) <= point) + goto fail; + break; +#else /* not emacs19 */ + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) + goto fail; + break; +#endif /* not emacs19 */ + + case syntaxspec: + DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchsyntax; + + case wordchar: + DEBUG_PRINT1 ("EXECUTING wordchar.\n"); + mcnt = (int) Sword; + matchsyntax: + PREFETCH (); + if (SYNTAX (*d++) != (enum syntaxcode) mcnt) goto fail; + SET_REGS_MATCHED (); + break; + + case notsyntaxspec: + DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchnotsyntax; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING notwordchar.\n"); + mcnt = (int) Sword; + matchnotsyntax: /* We 
goto here from notsyntaxspec. */ + PREFETCH (); + if (SYNTAX (*d++) == (enum syntaxcode) mcnt) goto fail; + SET_REGS_MATCHED (); + break; + +#else /* not emacs */ + case wordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); + PREFETCH (); + if (!LETTER_P (d)) + goto fail; + SET_REGS_MATCHED (); + break; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); + PREFETCH (); + if (LETTER_P (d)) + goto fail; + SET_REGS_MATCHED (); + break; +#endif /* not emacs */ + + default: + abort (); + } + continue; /* Successfully executed one pattern command; keep going. */ + + + /* We goto here if a matching operation fails. */ + fail: + if (!FAIL_STACK_EMPTY ()) + { /* A restart point is known. Restore to that state. */ + DEBUG_PRINT1 ("\nFAIL:\n"); + POP_FAILURE_POINT (d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) + goto fail; + + /* If we failed to the end of the pattern, don't examine *p. */ + assert (p <= pend); + if (p < pend) + { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) *p) + { + case jump_n: + is_a_jump_n = true; + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) + dend = end_match_1; + } + else + break; /* Matching at this starting point really fails. */ + } /* for (;;) */ + + if (best_regs_set) + goto restore_best_regs; + + FREE_VARIABLES (); + + return -1; /* Failure to match. */ +} /* re_match_2 */ + +/* Subroutine definitions for re_match_2. 
*/
+
+
+/* We are passed P pointing to a register number after a start_memory.
+
+   Return true if the pattern up to the corresponding stop_memory can
+   match the empty string, and false otherwise.
+
+   If we find the matching stop_memory, sets P to point to one past its number.
+   Otherwise, sets P to an undefined byte less than or equal to END.
+
+   REG_INFO is only passed through to the helper routines below.
+
+   We don't handle duplicates properly (yet).  */
+
+static boolean
+group_match_null_string_p (p, end, reg_info)
+    unsigned char **p, *end;
+    register_info_type *reg_info;
+{
+  int mcnt;
+  /* Point to after the args to the start_memory.  */
+  unsigned char *p1 = *p + 2;
+
+  while (p1 < end)
+    {
+      /* Skip over opcodes that can match nothing, and return true or
+         false, as appropriate, when we get to one that can't, or to the
+         matching stop_memory.  */
+
+      switch ((re_opcode_t) *p1)
+        {
+        /* Could be either a loop or a series of alternatives.  */
+        case on_failure_jump:
+          p1++;
+          EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+
+          /* If the next operation is not a jump backwards in the
+             pattern.  */
+
+          if (mcnt >= 0)
+            {
+              /* Go through the on_failure_jumps of the alternatives,
+                 seeing if any of the alternatives cannot match nothing.
+                 The last alternative starts with only a jump,
+                 whereas the rest start with on_failure_jump and end
+                 with a jump, e.g., here is the pattern for `a|b|c':
+
+                 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
+                 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
+                 /exactn/1/c
+
+                 So, we have to first go through the first (n-1)
+                 alternatives and then deal with the last one separately.  */
+
+
+              /* Deal with the first (n-1) alternatives, which start
+                 with an on_failure_jump (see above) that jumps to right
+                 past a jump_past_alt.  */
+
+              while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
+                {
+                  /* `mcnt' holds how many bytes long the alternative
+                     is, including the ending `jump_past_alt' and
+                     its number.  */
+
+                  if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
+                                                reg_info))
+                    return false;
+
+                  /* Move to right after this alternative, including the
+                     jump_past_alt.  */
+                  p1 += mcnt;
+
+                  /* Break if it's the beginning of an n-th alternative
+                     that doesn't begin with an on_failure_jump.  */
+                  if ((re_opcode_t) *p1 != on_failure_jump)
+                    break;
+
+                  /* Still have to check that it's not an n-th
+                     alternative that starts with an on_failure_jump.  */
+                  p1++;
+                  EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+                  if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
+                    {
+                      /* Get to the beginning of the n-th alternative.  */
+                      p1 -= 3;
+                      break;
+                    }
+                }
+
+              /* Deal with the last alternative: go back and get number
+                 of the `jump_past_alt' just before it.  `mcnt' contains
+                 the length of the alternative.  */
+              EXTRACT_NUMBER (mcnt, p1 - 2);
+
+              if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
+                return false;
+
+              p1 += mcnt;	/* Get past the n-th alternative.  */
+            } /* if mcnt >= 0 */
+          break;
+
+
+        case stop_memory:
+          /* The closing group must carry the same register number as
+             the start_memory we were handed.  */
+          assert (p1[1] == **p);
+          *p = p1 + 2;
+          return true;
+
+
+        default:
+          if (!common_op_match_null_string_p (&p1, end, reg_info))
+            return false;
+        }
+    } /* while p1 < end */
+
+  /* Ran off END without seeing the matching stop_memory.  */
+  return false;
+} /* group_match_null_string_p */
+
+
+/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
+   It expects P to be the first byte of a single alternative and END one
+   byte past the last.  The alternative can contain groups.
+
+   Returns true if every opcode in [P, END) can match the empty string,
+   false as soon as one cannot.  */
+
+static boolean
+alt_match_null_string_p (p, end, reg_info)
+    unsigned char *p, *end;
+    register_info_type *reg_info;
+{
+  int mcnt;
+  unsigned char *p1 = p;
+
+  while (p1 < end)
+    {
+      /* Skip over opcodes that can match nothing, and break when we get
+         to one that can't.  */
+
+      switch ((re_opcode_t) *p1)
+        {
+        /* It's a loop.  Step over the opcode and its two-byte offset,
+           then follow the offset.  */
+        case on_failure_jump:
+          p1++;
+          EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+          p1 += mcnt;
+          break;
+
+        default:
+          if (!common_op_match_null_string_p (&p1, end, reg_info))
+            return false;
+        }
+    } /* while p1 < end */
+
+  return true;
+} /* alt_match_null_string_p */
+
+
+/* Deals with the ops common to group_match_null_string_p and
+   alt_match_null_string_p.
+
+   Sets P to one after the op and its arguments, if any.
+
+   Returns true if the opcode at *P can match the empty string; on a
+   false return *P is left unchanged.  */
+
+static boolean
+common_op_match_null_string_p (p, end, reg_info)
+    unsigned char **p, *end;
+    register_info_type *reg_info;
+{
+  int mcnt;
+  boolean ret;
+  int reg_no;
+  unsigned char *p1 = *p;
+
+  switch ((re_opcode_t) *p1++)
+    {
+    /* Zero-width opcodes with no arguments.  */
+    case no_op:
+    case begline:
+    case endline:
+    case begbuf:
+    case endbuf:
+    case wordbeg:
+    case wordend:
+    case wordbound:
+    case notwordbound:
+#ifdef emacs
+    case before_dot:
+    case at_dot:
+    case after_dot:
+#endif
+      break;
+
+    case start_memory:
+      reg_no = *p1;
+      assert (reg_no > 0 && reg_no <= MAX_REGNUM);
+      ret = group_match_null_string_p (&p1, end, reg_info);
+
+      /* Have to set this here in case we're checking a group which
+         contains a group and a back reference to it.  */
+
+      if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
+        REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
+
+      if (!ret)
+        return false;
+      break;
+
+    /* If this is an optimized succeed_n for zero times, make the jump.  */
+    case jump:
+      EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+      if (mcnt >= 0)
+        p1 += mcnt;
+      else
+        return false;
+      break;
+
+    case succeed_n:
+      /* Get to the number of times to succeed.  */
+      p1 += 2;
+      EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+
+      if (mcnt == 0)
+        {
+          /* Back up to the jump offset that precedes the count and
+             follow it.  */
+          p1 -= 4;
+          EXTRACT_NUMBER_AND_INCR (mcnt, p1);
+          p1 += mcnt;
+        }
+      else
+        return false;
+      break;
+
+    case duplicate:
+      /* NOTE(review): P1 is not advanced past the register-number byte
+         here, so *P ends up pointing at the argument rather than after
+         it -- confirm whether that is intended.  */
+      if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
+        return false;
+      break;
+
+    case set_number_at:
+      p1 += 4;
+      /* Falls through: there is no `break', so this case also
+         returns false.  */
+
+    default:
+      /* All other opcodes mean we cannot match the empty string.  */
+      return false;
+  }
+
+  *p = p1;
+  return true;
+} /* common_op_match_null_string_p */
+
+
+/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
+   bytes; nonzero otherwise.  The loop runs until LEN reaches zero, so
+   LEN must not be negative.  */
+
+static int
+bcmp_translate (s1, s2, len, translate)
+     unsigned char *s1, *s2;
+     register int len;
+     char *translate;
+{
+  register unsigned char *p1 = s1, *p2 = s2;
+  while (len)
+    {
+      if (translate[*p1++] != translate[*p2++]) return 1;
+      len--;
+    }
+  return 0;
+}
+
+/* Entry points for GNU code.  */
+
+/* re_compile_pattern is the GNU regular expression compiler: it
+   compiles PATTERN (of length SIZE) and puts the result in BUFP.
+   Returns 0 if the pattern was valid, otherwise an error string.
+   (The parameter that the text above calls SIZE is named LENGTH in
+   the definition below.)
+
+   Assumes the `allocated' (and perhaps `buffer') and `translate' fields
+   are set in BUFP on entry.
+
+   We call regex_compile to do the actual compilation.  */
+
+const char *
+re_compile_pattern (pattern, length, bufp)
+     const char *pattern;
+     int length;
+     struct re_pattern_buffer *bufp;
+{
+  reg_errcode_t ret;
+
+  /* GNU code is written to assume at least RE_NREGS registers will be set
+     (and at least one extra will be -1).  */
+  bufp->regs_allocated = REGS_UNALLOCATED;
+
+  /* And GNU code determines whether or not to get register information
+     by passing null for the REGS argument to re_match, etc., not by
+     setting no_sub.  */
+  bufp->no_sub = 0;
+
+  /* Match anchors at newline.  */
+  bufp->newline_anchor = 1;
+
+  ret = regex_compile (pattern, length, re_syntax_options, bufp);
+
+  /* The error code indexes the message table.  */
+  return re_error_msg[(int) ret];
+}
+
+/* Entry points compatible with 4.2 BSD regex library.  We don't define
+   them if this is an Emacs or POSIX compilation.  */
+
+#if !defined (emacs) && !defined (_POSIX_SOURCE)
+
+/* BSD has one and only one pattern buffer.
*/
+static struct re_pattern_buffer re_comp_buf;
+
+/* Compile S into the single static pattern buffer used by re_exec.
+
+   If S is null, nothing is compiled: return 0 when the buffer has
+   already been set up by an earlier call, or an error string when it
+   has not.
+
+   Otherwise return 0 if S compiled, or an error string describing why
+   it did not (including "Memory exhausted" when the buffer or fastmap
+   cannot be allocated).  */
+
+char *
+re_comp (s)
+    const char *s;
+{
+  reg_errcode_t ret;
+
+  if (!s)
+    {
+      if (!re_comp_buf.buffer)
+        return "No previous regular expression";
+      return 0;
+    }
+
+  if (!re_comp_buf.buffer)
+    {
+      re_comp_buf.buffer = (unsigned char *) malloc (200);
+      if (re_comp_buf.buffer == NULL)
+        return "Memory exhausted";
+      re_comp_buf.allocated = 200;
+
+      re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
+      if (re_comp_buf.fastmap == NULL)
+        {
+          /* Undo the buffer allocation so the static state stays
+             consistent: otherwise a later `re_comp (NULL)' would claim a
+             previous expression exists, and the fastmap allocation
+             would never be retried because `buffer' is non-null.  */
+          free (re_comp_buf.buffer);
+          re_comp_buf.buffer = NULL;
+          re_comp_buf.allocated = 0;
+          return "Memory exhausted";
+        }
+    }
+
+  /* Since `re_exec' always passes NULL for the `regs' argument, we
+     don't need to initialize the pattern buffer fields which affect it.  */
+
+  /* Match anchors at newlines.  */
+  re_comp_buf.newline_anchor = 1;
+
+  ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
+
+  /* Yes, we're discarding `const' here.  */
+  return (char *) re_error_msg[(int) ret];
+}
+
+
+/* Search S for the pattern last compiled by re_comp.  Return 1 if
+   re_search reports a match anywhere in S, 0 otherwise.  No register
+   information is requested.  */
+
+int
+re_exec (s)
+    const char *s;
+{
+  const int len = strlen (s);
+  return
+    0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
+}
+#endif /* not emacs and not _POSIX_SOURCE */
+
+/* POSIX.2 functions.  Don't define these for Emacs.  */
+
+#ifndef emacs
+
+/* regcomp takes a regular expression as a string and compiles it.
+
+   PREG is a regex_t *.  We do not expect any fields to be initialized,
+   since POSIX says we shouldn't.  Thus, we set
+
+     `buffer' to the compiled pattern;
+     `used' to the length of the compiled pattern;
+     `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
+       REG_EXTENDED bit in CFLAGS is set; otherwise, to
+       RE_SYNTAX_POSIX_BASIC;
+     `newline_anchor' to REG_NEWLINE being set in CFLAGS;
+     `fastmap' and `fastmap_accurate' to zero;
+     `re_nsub' to the number of subexpressions in PATTERN.
+
+   PATTERN is the address of the pattern string.
+
+   CFLAGS is a series of bits which affect compilation.
+
+     If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
+     use POSIX basic syntax.
+
+     If REG_NEWLINE is set, then . and [^...]
don't match newline.
+     Also, regexec will try a match beginning after every newline.
+
+     If REG_ICASE is set, then we consider upper- and lowercase
+     versions of letters to be equivalent when matching.
+
+     If REG_NOSUB is set, then when PREG is passed to regexec, that
+     routine will report only success or failure, and nothing about the
+     registers.
+
+   It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
+   the return codes and their meanings.)  */
+
+int
+regcomp (preg, pattern, cflags)
+    regex_t *preg;
+    const char *pattern;
+    int cflags;
+{
+  reg_errcode_t ret;
+  unsigned syntax
+    = cflags & REG_EXTENDED ? RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
+
+  /* regex_compile will allocate the space for the compiled pattern.
+     PREG may arrive completely uninitialized, so clear the size fields
+     along with the pointer instead of leaving stack garbage in them.
+     NOTE(review): presumably regex_compile consults `allocated' to
+     decide whether to allocate -- confirm against its definition.  */
+  preg->buffer = 0;
+  preg->allocated = 0;
+  preg->used = 0;
+
+  /* Don't bother to use a fastmap when searching.  This simplifies the
+     REG_NEWLINE case: if we used a fastmap, we'd have to put all the
+     characters after newlines into the fastmap.  This way, we just try
+     every character.  */
+  preg->fastmap = 0;
+  preg->fastmap_accurate = 0;
+
+  if (cflags & REG_ICASE)
+    {
+      unsigned i;
+
+      preg->translate = (char *) malloc (CHAR_SET_SIZE);
+      if (preg->translate == NULL)
+        return (int) REG_ESPACE;
+
+      /* Map uppercase characters to corresponding lowercase ones.  */
+      for (i = 0; i < CHAR_SET_SIZE; i++)
+        preg->translate[i] = isupper (i) ? tolower (i) : i;
+    }
+  else
+    preg->translate = NULL;
+
+  /* If REG_NEWLINE is set, newlines are treated differently.  */
+  if (cflags & REG_NEWLINE)
+    { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
+      syntax &= ~RE_DOT_NEWLINE;
+      syntax |= RE_HAT_LISTS_NOT_NEWLINE;
+      /* It also changes the matching behavior.  */
+      preg->newline_anchor = 1;
+    }
+  else
+    preg->newline_anchor = 0;
+
+  preg->no_sub = !!(cflags & REG_NOSUB);
+
+  /* POSIX says a null character in the pattern terminates it, so we
+     can use strlen here in compiling the pattern.  */
+  ret = regex_compile (pattern, strlen (pattern), syntax, preg);
+
+  /* POSIX doesn't distinguish between an unmatched open-group and an
+     unmatched close-group: both are REG_EPAREN.  */
+  if (ret == REG_ERPAREN) ret = REG_EPAREN;
+
+  return (int) ret;
+}
+
+
+/* regexec searches for a given pattern, specified by PREG, in the
+   string STRING.
+
+   If NMATCH is zero or REG_NOSUB was set in the cflags argument to
+   `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
+   least NMATCH elements, and we set them to the offsets of the
+   corresponding matched substrings.
+
+   EFLAGS specifies `execution flags' which affect matching: if
+   REG_NOTBOL is set, then ^ does not match at the beginning of the
+   string; if REG_NOTEOL is set, then $ does not match at the end.
+
+   We return 0 if we find a match and REG_NOMATCH if not.  REG_NOMATCH
+   is also returned when the temporary register arrays cannot be
+   allocated.  */
+
+int
+regexec (preg, string, nmatch, pmatch, eflags)
+    const regex_t *preg;
+    const char *string;
+    size_t nmatch;
+    regmatch_t pmatch[];
+    int eflags;
+{
+  int ret;
+  struct re_registers regs;
+  regex_t private_preg;
+  int len = strlen (string);
+  boolean want_reg_info = !preg->no_sub && nmatch > 0;
+
+  /* Work on a copy so the caller's (const) pattern buffer is not
+     modified by the per-call flags below.  */
+  private_preg = *preg;
+
+  private_preg.not_bol = !!(eflags & REG_NOTBOL);
+  private_preg.not_eol = !!(eflags & REG_NOTEOL);
+
+  /* The user has told us exactly how many registers to return
+     information about, via `nmatch'.  We have to pass that on to the
+     matching routines.  */
+  private_preg.regs_allocated = REGS_FIXED;
+
+  if (want_reg_info)
+    {
+      regs.num_regs = nmatch;
+      regs.start = TALLOC (nmatch, regoff_t);
+      regs.end = TALLOC (nmatch, regoff_t);
+      if (regs.start == NULL || regs.end == NULL)
+        {
+          /* Only one of the two allocations may have failed; release
+             the other so it is not leaked.  */
+          if (regs.start != NULL)
+            free (regs.start);
+          if (regs.end != NULL)
+            free (regs.end);
+          return (int) REG_NOMATCH;
+        }
+    }
+
+  /* Perform the searching operation.  */
+  ret = re_search (&private_preg, string, len,
+                   /* start: */ 0, /* range: */ len,
+                   want_reg_info ? &regs : (struct re_registers *) 0);
+
+  /* Copy the register information to the POSIX structure.  */
+  if (want_reg_info)
+    {
+      if (ret >= 0)
+        {
+          unsigned r;
+
+          for (r = 0; r < nmatch; r++)
+            {
+              pmatch[r].rm_so = regs.start[r];
+              pmatch[r].rm_eo = regs.end[r];
+            }
+        }
+
+      /* If we needed the temporary register info, free the space now.  */
+      free (regs.start);
+      free (regs.end);
+    }
+
+  /* We want zero return to mean success, unlike `re_search'.  */
+  return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
+}
+
+
+/* Returns a message corresponding to an error code, ERRCODE, returned
+   from either regcomp or regexec.
+
+   The message is copied into ERRBUF, truncated and null-terminated if
+   it does not fit in ERRBUF_SIZE bytes; nothing is written when
+   ERRBUF_SIZE is zero.  The return value is the size needed to hold
+   the whole message, including the null.  PREG is unused.
+
+   NOTE(review): ERRCODE indexes re_error_msg without a range check --
+   confirm that callers only pass codes produced by this library.  */
+
+size_t
+regerror (errcode, preg, errbuf, errbuf_size)
+    int errcode;
+    const regex_t *preg;
+    char *errbuf;
+    size_t errbuf_size;
+{
+  const char *msg
+    = re_error_msg[errcode] == NULL ? "Success" : re_error_msg[errcode];
+  size_t msg_size = strlen (msg) + 1; /* Includes the null.  */
+
+  if (errbuf_size != 0)
+    {
+      if (msg_size > errbuf_size)
+        {
+          strncpy (errbuf, msg, errbuf_size - 1);
+          errbuf[errbuf_size - 1] = 0;
+        }
+      else
+        strcpy (errbuf, msg);
+    }
+
+  return msg_size;
+}
+
+
+/* Free dynamically allocated space used by PREG.  The compiled
+   pattern, the fastmap and the translate table are released and their
+   pointers and size fields cleared; PREG itself is not freed.  */
+
+void
+regfree (preg)
+    regex_t *preg;
+{
+  if (preg->buffer != NULL)
+    free (preg->buffer);
+  preg->buffer = NULL;
+
+  preg->allocated = 0;
+  preg->used = 0;
+
+  if (preg->fastmap != NULL)
+    free (preg->fastmap);
+  preg->fastmap = NULL;
+  preg->fastmap_accurate = 0;
+
+  if (preg->translate != NULL)
+    free (preg->translate);
+  preg->translate = NULL;
+}
+
+#endif /* not emacs */
+
+/*
+Local variables:
+make-backup-files: t
+version-control: t
+trim-versions-without-asking: nil
+End:
+*/
diff --git a/lib/regex.h b/lib/regex.h
new file mode 100644
index 0000000..87824ef
--- /dev/null
+++ b/lib/regex.h
@@ -0,0 +1,481 @@
+/* Definitions for data structures and routines for the regular
+   expression library, version 0.11.
+
+   Copyright (C) 1985, 89, 90, 91, 92 Free Software Foundation, Inc.
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#ifndef __REGEXP_LIBRARY_H__ +#define __REGEXP_LIBRARY_H__ + +/* POSIX says that <sys/types.h> must be included before <regex.h>. */ + +/* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ +typedef unsigned reg_syntax_t; + +/* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ +#define RE_BACKSLASH_ESCAPE_IN_LISTS (1) + +/* If this bit is not set, then + and ? are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ +#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + +/* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ +#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + +/* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). 
+ If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined. + We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. */ +#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + +/* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ +#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + +/* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ +#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + +/* If this bit is set, then . matches newline. + If not set, then it doesn't. */ +#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + +/* If this bit is set, then . doesn't match NUL. + If not set, then it does. */ +#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + +/* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ +#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + +/* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ +#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + +/* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. 
*/ +#define RE_LIMITED_OPS (RE_INTERVALS << 1) + +/* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ +#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + +/* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ +#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + +/* If this bit is set, (...) defines a group, and \( and \) are literals. + If not set, \(...\) defines a group, and ( and ) are literals. */ +#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + +/* If this bit is set, then \<digit> matches <digit>. + If not set, then \<digit> is a back-reference. */ +#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + +/* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ +#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + +/* If this bit is set, then an ending range point collating higher + than the starting range point, as in [z-a], is invalid. + If not set, then when ending range point collates higher than the + starting range point, the range is ignored. */ +#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + +/* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. */ +#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + +/* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ +extern reg_syntax_t re_syntax_options; + +/* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!) 
*/ +/* [[[begin syntaxes]]] */ +#define RE_SYNTAX_EMACS 0 + +#define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +#define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + +#define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + +#define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + +#define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + +#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + +/* Syntax bits common to both basic and extended POSIX regex syntax. */ +#define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + +#define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + +/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ +#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + +#define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + +/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. 
*/ +#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) +/* [[[end syntaxes]]] */ + +/* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ +#ifdef RE_DUP_MAX +#undef RE_DUP_MAX +#endif +#define RE_DUP_MAX ((1 << 15) - 1) + + +/* POSIX `cflags' bits (i.e., information for `regcomp'). */ + +/* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ +#define REG_EXTENDED 1 + +/* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ +#define REG_ICASE (REG_EXTENDED << 1) + +/* If this bit is set, then anchors do not match at newline + characters in the string. + If not set, then anchors do match at newlines. */ +#define REG_NEWLINE (REG_ICASE << 1) + +/* If this bit is set, then report only success or fail in regexec. + If not set, then returns differ between not matching and errors. */ +#define REG_NOSUB (REG_NEWLINE << 1) + + +/* POSIX `eflags' bits (i.e., information for regexec). */ + +/* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. */ +#define REG_NOTBOL 1 + +/* Like REG_NOTBOL, except for the end-of-line. */ +#define REG_NOTEOL (1 << 1) + + +/* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. */ +typedef enum +{ + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). */ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) 
*/ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ +} reg_errcode_t; + +/* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ + +struct re_pattern_buffer +{ +/* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. */ + char *translate; + + /* Number of subexpressions found by the compiler. 
*/ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ +#define REGS_UNALLOCATED 0 +#define REGS_REALLOCATE 1 +#define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when regex_compile compiles a pattern; set to one + by re_compile_fastmap when it updates the fastmap, if any. */ + unsigned fastmap_accurate : 1; + + /* If set, regexec reports only success or failure and does not + return anything in pmatch. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + +/* [[[end pattern_buffer]]] */ +}; + +typedef struct re_pattern_buffer regex_t; + + +/* search.c (search_buffer) in Emacs needs this one opcode value. It is + defined both in `regex.c' and here. */ +#define RE_EXACTN_VALUE 1 + +/* Type for byte offsets within the string. POSIX mandates this. */ +typedef int regoff_t; + + +/* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ +struct re_registers +{ + unsigned num_regs; + regoff_t *start; + regoff_t *end; +}; + + +/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. 
*/ +#ifndef RE_NREGS +#define RE_NREGS 30 +#endif + + +/* POSIX specification for registers. Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ +typedef struct +{ + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ +} regmatch_t; + +/* Declarations for routines. */ + +/* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. + + We also have to undo `const' if we are not ANSI and if it hasn't + previously been taken care of. */ + +#if __STDC__ +#define _RE_ARGS(args) args +#else +#define _RE_ARGS(args) () +#ifndef const +#define const +#endif +#endif + +/* Sets the current default syntax to SYNTAX, and returns the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ +extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); + +/* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ +extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, int length, + struct re_pattern_buffer *buffer)); + + +/* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if there was an + internal error. */ +extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); + + +/* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. 
Also return register + information in REGS (if REGS is nonzero and BUFFER->no_sub is zero). */ +extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); + + +/* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ +extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); + + +/* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ +extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); + + +/* Relates to `re_match' as `re_search_2' relates to `re_search'. */ +extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); + + +/* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + BUFFER will allocate its own register data, without + freeing the old data. */ +extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); + +/* 4.2 bsd compatibility. */ +extern char *re_comp _RE_ARGS ((const char *)); +extern int re_exec _RE_ARGS ((const char *)); + +/* POSIX compatibility. 
*/ +extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags)); +extern int regexec + _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags)); +extern size_t regerror + _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf, + size_t errbuf_size)); +extern void regfree _RE_ARGS ((regex_t *preg)); + +#endif /* not __REGEXP_LIBRARY_H__ */ + +/* +Local variables: +make-backup-files: t +version-control: t +trim-versions-without-asking: nil +End: +*/ diff --git a/old/textutils/ChangeLog b/old/textutils/ChangeLog new file mode 100644 index 0000000..71d300d --- /dev/null +++ b/old/textutils/ChangeLog @@ -0,0 +1,855 @@ +Sat Nov 7 00:26:14 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * wc.c (wc): If doing only -c, use st_size for regular files. + + * fold.c (fold_file): Was folding 1 column too early. + From Eric Backus . + + * memset.c: New file. + +Fri Nov 6 20:14:51 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * cksum.c: New file. + +Tue Oct 13 16:24:06 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * tac.c (tac_stdin): Handle SIGPIPE. + * sort.c (main): Handle SIGTERM. + + * od.c: New file. + + * system.h [USG || STDC_HEADERS]: Define bcmp. + +Sat Oct 3 20:41:24 1992 David J. MacKenzie (djm@goldman.gnu.ai.mit.edu) + + * sort.c (main): Handle SIGPIPE. From trq@dionysos.thphys.ox.ac.uk. + +Tue Sep 29 01:10:05 1992 David J. MacKenzie (djm@nutrimat.gnu.ai.mit.edu) + + * paste.c (main): Don't write on a string constant. + +Mon Aug 24 00:02:45 1992 Jim Meyering (meyering@churchy.gnu.ai.mit.edu) + + * tr.c: Minor cleanup. Replaced some assert(0) with abort(). + +Tue Jul 7 02:14:19 1992 David J. MacKenzie (djm@nutrimat.gnu.ai.mit.edu) + + * cmp.c, cmp.1: Move to diff distribution. + +Fri Jul 3 16:37:59 1992 David J. MacKenzie (djm@nutrimat.gnu.ai.mit.edu) + + * system.h: Change FOO_MISSING to HAVE_FOO. + +Wed May 13 20:05:41 1992 David J. 
MacKenzie (djm@churchy.gnu.ai.mit.edu) + + * pr.c (COLUMN): Add structure member to remember filename. + (main, init_fps, open_file, close_file): Use it. + + (close_file): Don't decrement cols_ready_to_print when closing + a file. From cdl@mpl.UCSD.EDU (Carl Lowenstein). + +Mon May 11 19:17:33 1992 David J. MacKenzie (djm@churchy.gnu.ai.mit.edu) + + * cmp.c: --show-chars -> --print-chars. + + * pr.c: Rename some variables. + +Sat May 9 18:39:47 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * system.h: Define DEV_BSIZE if not defined. + +Wed Apr 22 02:15:09 1992 David J. MacKenzie (djm@churchy.gnu.ai.mit.edu) + + * system.h, tac.c: SIGTYPE -> RETSIGTYPE. + +Fri Apr 17 10:42:23 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * sort.c (main): Don't stop processing args when we hit "-"; + treat it like a regular filename. + From ian@airs.com (Ian Lance Taylor). + + * pr.c (print_page): Fix off by one line count when ^L is in input. + From Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de). + +Mon Apr 6 20:52:29 1992 Jim Meyering (meyering@churchy.gnu.ai.mit.edu) + + * tr.c (validate): Change error message so it doesn't mention + actual name of --truncate-set1 option. From David MacKenzie. + +Sun Apr 5 14:22:42 1992 Jim Meyering (meyering@hal.gnu.ai.mit.edu) + + * tr.c (string2_extend, validate): Give an error message when + translating without --truncate-set1, with empty string2, and + with non-empty string1. "tr 1 ''" produced a failed assertion. + +Mon Mar 30 02:20:56 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * system.h: Change how ST_BLKSIZE is calculated to allow for + non-POSIX systems that don't define BSIZE in sys/param.h. + +Sat Mar 28 11:18:01 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * sum.c (main, bsd_sum_file): Don't print filename if BSD + algorithm is used and only one file was given. 
+ +Wed Mar 25 11:34:41 1992 Jim Meyering (meyering@wombat.gnu.ai.mit.edu) + + * tr.c (get_spec_stats): Fix assertion to allow ranges like a-a + with starting character equal to ending character. This is + contrary to the POSIX spec, but what is already implemented + in find_closing_delim. + +Mon Mar 16 00:15:11 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * Version 1.3. + + * sort.c (numcompare, checkfp): Add parens to placate gcc2. + + * sort.c (mergefps): For -u, output the first, not last, of + the lines that compare equal. From Mike Haertel. + +Tue Mar 10 10:51:38 1992 David J. MacKenzie (djm@nutrimat.gnu.ai.mit.edu) + + * tr.c: Remove initial capitals and periods from error messages. + +Sun Mar 8 22:03:45 1992 David J. MacKenzie (djm@nutrimat.gnu.ai.mit.edu) + + * sum.c (main): Add -r option for SYSV compat. + +Thu Feb 27 22:26:25 1992 David J. MacKenzie (djm@wookumz.gnu.ai.mit.edu) + + * sort.c (compare): If -s given, leave lines in their original order. + (main): Recognize -s. + (usage): Document -s. + From Mike Haertel. + +Tue Feb 18 20:29:45 1992 Randall Smith (randy at geech.gnu.ai.mit.edu) + + * sort.c (sort): Check for complete parsing of buffer into + lines before nixing temp files. + +Mon Feb 17 10:35:58 1992 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * sum.c (sysv_sum_file): Use %lu instead of %u to print a + long. Not that it matters for GNU . . . + + * tr.c (unquote, make_printable_str): Use \007 instead of ANSI \a. + (append_normal_char, append_range, append_char_class, + append_repeated_char, append_equiv_class, spec_init): + Initialize `next' field of new `struct List_element'. + From rommel@informatik.tu-muenchen.de (Kai-Uwe Rommel). + +Sat Feb 8 17:16:49 1992 David J. MacKenzie (djm at apple-gunkies.gnu.ai.mit.edu) + + * join.c (get_line): Renamed from getline to avoid GNU libc conflict. + +Sun Feb 2 21:22:01 1992 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * Version 1.2. 
+ + * nl.c: Support multiple files and "-" for stdin. + (main): Check for read and write errors. + (nl_file): New function. + +Wed Jan 29 10:09:10 1992 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * tr.c (main): -t option was called -b in getopt spec. + (validate): Don't warn that set1 is longer than set2. + + * tr.c: Rename --sysv-string2-truncate to --truncate-string1. + +Fri Jan 17 16:29:05 1992 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * nl.c: New program from bin-src. + + * nl.c (main): Use a struct linebuffer for line_buf. + (process_file): Use readline instead of fgets, to preserve NULs. + (check_section): Use memcmp instead of strncmp. + (proc_text): Print line_buf with fwrite instead of printf. + + * nl.c (main): Usage message if too many args given. Check + for error in closing input file. Lengths of section delimiter + strings were 1 too large. Take separator_str into account in + length of print_no_line_fmt. + (build_print_fmt): Allocate space for print_fmt, in case + separator_str is long. + (proc_text): A blank line is one that contains nothing, not + even nonprinting characters. + +Fri Jan 17 01:04:22 1992 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * All programs: Document `--' instead of `+' to introduce + long-named options, in usage messages. + + * sum.c (bsd_sum_file): Renamed from sum_file. + (sysv_sum_file): New function. + (main): Recognize an option to select between the algorithms. + +Sun Jan 5 17:41:18 1992 Jim Meyering (meyering at churchy.gnu.ai.mit.edu) + + * pr.c (close_file, print_page): Fixed bug that had the command + yes |head |pr -t printing "yyyyyyyyyy". + * (print_page): Fixed bug that had pr -3 -a printing two too few + trailer lines per page. + * (main): Added restriction that -a and -m are incompatible. + Although the POSIX spec doesn't explicitly say they shouldn't + be used together, it says -a modifies the -column option and + that -column shouldn't be used with -m. 
+ +Thu Jan 2 15:23:59 1992 David J. MacKenzie (djm at albert.gnu.ai.mit.edu) + + * nl.c: Include regex.h after, not before, sys/types.h. + +Thu Jan 2 12:18:10 1992 Tom Lord (lord at geech.gnu.ai.mit.edu) + + * sort.c (fillbuf) return bytes buffered instead of bytes read. + +Fri Dec 27 22:53:36 1991 Jim Kingdon (kingdon at geech.gnu.ai.mit.edu) + + * sort.c (LINEALLOC): New #define. + (struct lines): New field ``limit''. + (initlines): Set it from new arg ``limit''. + (sort, mergefps, checkfp): Pass new arg to initlines(). + (findlines): Don't realloc past lines->limit. + +Tue Dec 24 01:24:03 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * tac.c, sort.c, csplit.c, system.h: Change POSIX ifdefs to + HAVE_UNISTD_H and _POSIX_VERSION. + + * xwrite.c: Change POSIX ifdef to HAVE_UNISTD_H. + +Sat 14 Dec 1991 11:46:42 Jim Meyering (meyering at wombat) + + * tr.c: Fixed an inaccurate comment on posix_pedantic. + +Thu 12 Dec 1991 21:15:20 Jim Meyering (meyering at hal) + + * tr.c: Changed underscores to hyphens in long option name + "sysv_string2_truncate". + +Wed Dec 11 13:33:34 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * tac.c (main): Set obscure_syntax to tell re_search to + allocate memory for the group registers. + +Fri Dec 6 18:26:27 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * tac.c, sort.c, csplit.c [POSIX]: Use sigaction instead of + signal, which POSIX doesn't have. + * sort.c: Replace inthandler and huphandler with sighandler. + * csplit.c (main): Only handle signals if they were not being + ignored. + + * tr.c: POSIX_ME_HARDER -> POSIXLY_CORRECT. + +Wed Dec 4 00:47:47 1991 Jim Meyering (meyering at wombat) + + * tr.c (unquote): Reformat code so it doesn't go beyond column 80. + * tr.c (squeeze_filter): Comment a little on why it's better + to step through the input by two. + * tr.c (set_initialize): Write a comment describing the function. 
+ * tr.c: Eliminated the variable `portability_warnings' and replaced + references to it by references to `!posix_pedantic'. One of the + uses of portability_warnings had been wrong. + +Tue Dec 3 14:03:35 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * tr.c: New program. + +Sun Dec 1 15:07:35 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * linebuffer.[ch] (freebuffer): New function (used by cron). + +Thu Oct 17 22:30:22 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * system.h, configure, Makefile.in: Don't include memory.h if + STDC_HEADERS, removing need for MEMORY_H_MISSING. + +Thu 17 Oct 1991 16:53:07 Jim Meyering (meyering at wombat) + + * pr.c (print_page): REALLY fixed `extra newline at EOF' problem. + Somehow, part of my patch didn't make it last time. + +Sat Oct 12 12:04:47 1991 David J. MacKenzie (djm at churchy.gnu.ai.mit.edu) + + * tail.c (pipe_lines, pipe_bytes): Initialize `first->next'. + + * cmp.c (cmp): Print EOF message to stderr, not stdout, for + POSIX 1003.2.11.2. + + * sort.c (xfwrite): fwrite never returns < 0, so check if + number written is number we asked to write. + (fillbuf, main): fread never returns < 0, so check ferror instead. + From Rainer Orth. + +Tue Oct 8 18:07:08 1991 Jim Meyering (meyering at churchy) + + * pr.c (print_page): Really fixed `extra newline at EOF' problem. + * (store_columns): Fixed bug that caused `pr -b -2' to coredump + on files of certain lengths. + +Fri Oct 4 22:30:25 1991 Jim Meyering (meyering at churchy) + + * pr.c (print_page): Fixed to not add single spurious newline + at EOF when using -t. + +Wed Oct 2 01:02:05 1991 David J. MacKenzie (djm at apple-gunkies) + + * pr.c (print_page): Don't pad the page if -t given. + + * csplit.c (load_buffer), sort.c (mergefps): Use bcopy, not memcpy. + +Thu Sep 26 12:35:00 1991 David J. MacKenzie (djm at churchy.gnu.ai.mit.edu) + + * Version 1.1. + + * configure, system.h: Include memory.h if it works. 
+ + * split.c: Allow `b' unit as well as `k' and `m'. + + * head.c, tail.c: Replace -b +blocks option with specifying + units (b, k, or m) after the number. + (parse_unit): New function. + + * fold.c (main): Check that -w arg is a number. + + * cut.c: +delimiter takes an arg. + +Mon Sep 16 14:52:38 1991 David J. MacKenzie (djm at churchy.gnu.ai.mit.edu) + + * pr.c (close_file): Don't close an already closed file. + +Thu Sep 12 00:14:43 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * memchr.c: New file. + * configure: Check if it's needed. + + * csplit.c, gcsplit.1: New program. + + * pr.c (cleanup): Only free buffers if they were allocated. + + * sort.c [!USG && !STDC_HEADERS]: Declare memchr. + +Wed Sep 11 20:54:16 1991 Jim Meyering (meyering at churchy) + + * pr.c: The following 3 bugs appeared (at least) when printing + a single file with the options `-3 -f'. + * (print_white_space): Single spaces were being replaced + with tabs. + * (print_page): Some lines were getting too much white space + at the beginning because spaces_not_printed wasn't being reset + to 0. + * (read_line): The single space between a truncated column + on its left and the column on its right was omitted. Fixed + so that previous value of input_position is restored before + returning FALSE. + +Sat Sep 7 03:22:18 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * configure: Only remove /etc from PATH when it's not part of + a larger name. + +Wed Sep 4 17:09:24 1991 David J. MacKenzie (djm at apple-gunkies) + + * linebuffer.c (readline): Fix incorrect recalculation of `end'. + + * head.c, tail.c: Replace `mode' variables and bitmasks with + separate variables for each option. + +Mon Sep 2 04:00:37 1991 David J. MacKenzie (djm at apple-gunkies) + + * wc.c: New program. + +Sun Sep 1 01:18:38 1991 David J. MacKenzie (djm at apple-gunkies) + + * fold.c (fold_file): Read in an int, not a char, for EOF + comparison. + + * configure: Check whether st_blksize is missing. 
+ + * tac.c (save_stdin): Put copy of pipe input in TMPDIR if + defined, instead of /tmp. + +Thu Aug 29 14:48:15 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * xwrite.c [POSIX]: unistd.h might require sys/types.h. + +Wed Aug 28 11:57:39 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * paste.c (main): Consider -d "" to be like -d "\0", + for POSIX (if I read it right). + + * sort.c, join.c: New programs. + + * cut.c (set_field): Allow blanks as well as commas to + separate numbers in ranges. + +Sun Aug 25 19:57:40 1991 Jim Meyering (meyering at apple-gunkies) + + * pr.c: Failure to open an input file is no longer a fatal error. + A message is printed for each failed open. When printing + in parallel, each failed open results in one fewer output column. + Added POSIX -r option to suppress the message. + * pr.c: Added variables: failed_opens, ignore_failed_opens. + These changes were based in part on work by David MacKenzie. + +Sat Aug 24 15:27:39 1991 Jim Meyering (meyering at pogo) + + * pr.c: Complain if user gives both -m and -[0-9]+ options. + +Wed Aug 21 22:04:57 1991 David J. MacKenzie (djm at apple-gunkies) + + * Version 1.0. + +Mon Aug 19 00:16:51 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * expand.c: Rename some variables. + (expand): Don't access value off end of array. + * unexpand.c: Rename some variables. + (unexpand): Don't access value off end of array. + Instead of copying tabs verbatim and flushing pending spaces + when one is reached, count them as the proper number of + pending spaces. Instead of changing tabs to single spaces if + the tabstop list is exhausted, print the rest of the line + unchanged (for POSIX). + +Sat Aug 17 01:49:41 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * cut.c (cut_file), paste.c (paste_parallel, paste_serial): + Clear EOF and error conditions on stdin so it can be reused. 
+ + * expand.c, unexpand.c (parse_tabstops): Allow blanks as well + as commas to separate tabstops, for POSIX. + * expand.c (expand), unexpand.c (unexpand): Don't line-buffer + the output; send it directly to stdout. + * unexpand.c (main): Make -t stupidly imply -a for POSIX. + (unexpand): If a tab stop list was given and we move past its end, + copy the rest of the line verbatim. + + * split.c (convint): New function to allow 'm' and 'k' after + byte counts. + (main): Use it. Change -c option to -b for POSIX. + +Fri Aug 9 02:47:02 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * pr.c: Protect isdigit with isascii, if required. + +Tue Aug 6 21:42:25 1991 David J. MacKenzie (djm at wheat-chex) + + Most of the following is from Paul Eggert: + * cat.c (main): If stdin is read, check close at end. + * cmp.c (main): Check for stdin being closed. + Check for close errors on stdin and stdout. + (cmp): Return a value instead of exiting. + * cut.c (cut_file): New function, from code in main. + Check for read errors. + (main): Check for close errors. + * expand.c, unexpand.c (main): Check for close errors. + (next_file): Go to next file if one can't be opened. + Check for close errors. + * head.c (main), tail.c (main): If stdin was read, check for + close errors. + * head.c (head_file), tail.c (tail_file): Check for close errors. + * paste.c (main, paste_parallel, paste_serial), tac.c (main): + Check for close errors. Close stdin if it was read. + * split.c (main): Check for close errors. + + * configure, Makefile.in's: Support +srcdir option. + Make config.status. Fix up clean targets. + +Wed Jul 31 01:32:59 1991 David J. MacKenzie (djm at hal) + + * linebuffer.h (struct linebuffer): Add a field to indicate + the number of valid chars in the line. + * linebuffer.c (initbuffer, readline): Fill it in. + * uniq.c, comm.c: Use it. + + * pr.c (main): Check stdin and stdout fclose for errors. 
+ (init_parameters): If there's no room for header and footer, + omit them rather than dying (for POSIX). + (init_header): Take a file descriptor as additional arg. + (init_fps): Change callers. Note when stdin is read. + (open_file): For filename "-" use stdin. + (close_file): Don't close stdin. Check close for errors. + (print_char, char_to_clump): Use isprint instead of explicit + comparisons. + + * memcmp.c: New file (needed for comm). + * bcopy.c: New file (needed for fold). + * system.h: Don't define bcopy as memcpy. + * configure: Check for bcopy and memcmp. + + * uniq.c (main): Use "-" instead of NULL to mean stdin or + stdout. + (check_file): Use "-" instead of NULL to mean stdin or stdout. + Check readline return instead of for NUL character to + detect eof. + Check fclose for errors. + (find_field): Use linebuffer length, not NULs, to detect end + of line. + (different): New function, replaces compare. Uses memcmp + instead of strncmp. + (writeline): Use fwrite instead of fputs so NULs are preserved. + + * comm.c (compare_files): Return an error indication. + Don't take a filename of NULL to mean stdin. + Use memcmp instead of strcmp to allow for NULs. + Check fclose for errors. + (writeline): Use fwrite instead of fputs so NULs are preserved. + + * sum.c (sum_file): Take an arg indicating whether to print + the filename, and don't take NULL meaning stdin. Set a flag + when we read stdin. Check fclose return for errors. + (main): If stdin was read, check fclose return for errors. + Use filename of "-" if no args given. + +Thu Jul 25 15:17:10 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * fold.c: Rewritten from scratch for POSIX. + +Wed Jul 24 01:55:41 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * split.c (line_bytes_split): Use xmalloc instead of alloca. + * system.h: Don't declare alloca. + + * tac.c, tail.c: Use SEEK_ instead of L_ for lseek. + * system.h: Define SEEK_ macros if not defined. 
+ + * pr.c: Rename variable `truncate' to avoid library function conflict. + +Tue Jul 23 13:21:48 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * linebuffer.c, linebuffer.h: New files. + * comm.c, uniq.c (initbuffer, readline): Functions + removed (use versions in linebuffer.c). + +Mon Jul 22 13:23:53 1991 David J. MacKenzie (djm at wookumz.gnu.ai.mit.edu) + + * sum.c (sumfile): Always print 5 digits for second number, too. + Interpret "-" to mean stdin. + +Sat Jul 20 14:24:40 1991 David J. MacKenzie (djm at bleen) + + * uniq.c: Use isblank instead of isspace, to support POSIX.2. + * comm.c, pr.c, uniq.c (concat, fatal, error, + pfatal_with_name, xmalloc, xrealloc): Functions removed. + +Sat Jul 13 02:04:53 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * nl.c: Add long-named options. Doc fixes. + +Sat Jul 6 02:19:09 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * expand.c, unexpand.c [STDC_HEADERS]: Include stdlib.h. + + * xwrite.c [POSIX]: Include unistd.h. + [STDC_HEADERS]: Don't declare errno. + +Sun Jun 30 23:35:16 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * uniq.c: Add long-named options. Remove marginally useful -z + option (zero padded repeat counts). + +Thu Jun 27 16:31:45 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * tail.c (tail_file), tac.c (save_stdin, tac_file), split.c + (cwrite), head.c (head_file), cat.c (main): Check close return + value for delayed error report due to NFS. + +Tue Jun 11 00:12:15 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * cat.c: Replace "uchar" with "unsigned char", to avoid + problems with various systems' typedefs. + +Thu Jun 6 12:54:26 1991 David J. MacKenzie (djm at geech.gnu.ai.mit.edu) + + * cat.c (cat): Interpret ENOTTY return from FIONREAD ioctl to mean + operation is unsupported, for HP-UX 7.0. + +Sun Apr 14 21:49:17 1991 Richard Stallman (rms at mole.gnu.ai.mit.edu) + + * sum.c: Always print five digits for first number. 
+ +Fri Mar 15 16:16:54 1991 David J. MacKenzie (djm at geech.ai.mit.edu) + + * cat.c, cmp.c: Don't use fileno(); not needed. + +Thu Jan 10 02:16:55 1991 David J. MacKenzie (djm at albert.ai.mit.edu) + + * tac.c, tail.c: Change _POSIX_SOURCE to POSIX. + +Thu Dec 27 00:06:45 1990 David J. MacKenzie (djm at egypt) + + * cut.c (cut_file_bytes, cut_file_fields): Make inbufp and + outbufp global. + (enlarge_line): Adjust inbufp and outbufp. + +Sun Sep 9 16:54:19 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cat.c: Declare free returning void, not int, so it + doesn't bomb on Xenix. + +Mon Sep 3 22:23:57 1990 David J. MacKenzie (djm at coke) + + * tac.c: Print error messages before calling cleanup, not after. + +Tue Aug 28 18:05:24 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * tac.c (cleanup): Return SIGTYPE, not int. + +Tue Aug 7 12:51:18 1990 David J. MacKenzie (djm at apple-gunkies) + + * cut.c (main, usage): Add -b and -n options for POSIX. + (set_fields): Don't allow SPC or TAB as number separators. + + * paste.c (paste_parallel): If open of any file fails, quit + (for POSIX). + +Mon Aug 6 22:14:13 1990 David J. MacKenzie (djm at apple-gunkies) + + * pr.c: Add POSIX -F option (same as -f). + + * uniq.c (check_file): Allow '-' to mean stdin or stdout. + +Mon Aug 6 14:43:30 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * head.c, tail.c: Change `chars' to `bytes' globally. + (main, usage): Use POSIX.2 draft 10 option syntax. + +Sun Aug 5 11:51:12 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * cat.c (main): Don't delay error messages, so they appear + where expected. + (main, simple_cat, cat): Make errors in input files nonfatal. + +Sat Aug 4 10:11:30 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * cat.c: Remove -c option added for POSIX draft 9, since POSIX + draft 10 removed it. 
+ + * tac.c (tac_stdin): Use fstat instead of lseek to determine + whether stdin is seekable, because lseek silently fails on + some special files, like tty's. + tail.c (tail_chars, tail_lines): Use fstat instead of lseek; + don't turn off -f for non-regular files (assume the user knows + what he's doing; it might work for fifo's and sockets). + + * paste.c (main): If no files given, use stdin. + Don't let collapse_escapes write on string constant (delim default). + (paste_parallel): Don't close stdin. + + * cut.c (main): Use standard input for filename of "-". + + * comm.c (compare_files): Allow '-' to mean stdin. + +Fri Aug 3 13:38:28 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * cut.c (enlarge_line): Take an arg giving the required amount + of space. Change callers. + (main): Don't allow -t'' without -f. + Make `delim' unsigned to fix sign extension problem in comparison. + +Tue Jul 17 12:36:11 EDT 1990 Jay Fenlason (hack@ai.mit.edu) + + * pr.c Deleted excess whitespace from ends of lines. + Modified to work with current version of getopt, which + returns 1 instead of 0 for non-options. + Reversed the meaning of the -f option, to be compatible + with real pr. + +Sun Jul 8 00:39:31 1990 David J. MacKenzie (djm at apple-gunkies) + + * cmp.c (main, usage): Rename -L option to -c and don't have + it imply -l. + (printc): Take an arg to specify number of chars to pad to, + for column alignment. + (cmp): Respect flag_print_chars in default output format. + Align columns for cmp -cl. + +Sat Jul 7 17:23:30 1990 David J. MacKenzie (djm at apple-gunkies) + + * cmp.c: For +show-chars, have getopt return 'L' so + `flag_print_chars' gets set. + +Fri Jun 29 01:04:19 1990 David J. MacKenzie (djm at apple-gunkies) + + * tac.c (main): Initialize fastmap and translate fields of + regex before compiling it. + +Fri Jun 22 00:38:20 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * tac.c: Change +regexp to +regex for consistency with GNU find.
+ +Wed Jun 20 01:46:09 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cat.c (cat): If FIONREAD is available, only use it if it is + supported by the filesystem that the file is on. + +Sun Jun 3 20:26:19 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cat.c (main): Add a variable to control whether the check + for input file == output file is made, because no values of + st_dev and st_ino should be assumed to be available for this + purpose. Only do the check for regular files. + + * tac.c: Use bcopy instead of memcpy. + +Thu May 31 00:55:36 1990 David J. MacKenzie (djm at apple-gunkies) + + * head.c: Use longs instead of ints for file offsets, for 16 + bit machines. + +Tue May 22 00:56:51 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cmp.c: Change some ints to longs for 16 bit machines. + (bcmp_cnt): Make char-pointer counting slightly simpler. + +Sat May 12 01:16:42 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cat.c (main): Allow input file to be output file for devices + (ttys, etc.). Check whether input file is output file when + reading standard input. Print any error messages for standard + input. + + * cmp.c (bcmp_cnt): Handle int comparisons correctly on 16 bit + machines as well as 32 bit ones. + * cmp.c, tail.c: Use longs instead of ints for file offsets. + +Fri May 11 02:11:03 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cmp.c: Fix some exit statuses for POSIX. + +Tue May 8 03:41:42 1990 David J. MacKenzie (djm at abyss) + + * tac.c: Use regular expressions as the record boundaries. + Give better error messages. + Reformat code and make it more readable. + (main): Use getopt_long to parse options. + (tac_stdin): Do not make a temporary file if standard input + is a file. + (tac_file): New function. + (tac): Take an open file desc as an arg. + (output): Rewrite to use its own efficient buffering. + (xmalloc, xrealloc, xwrite): New functions. + +Sun Apr 8 20:33:20 1990 David J. 
MacKenzie (djm at albert.ai.mit.edu) + + * head.c, tail.c: Use `error' instead of `fatal_perror' and + `nonfatal_perror'. Remove some unnecessary info from messages. + +Wed Mar 21 09:30:18 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * comm.c (main): Pass the list of files to compare_files as a + char ** instead of a char *. + (compare_files): Make arg a char **. + + * uniq.c: Declare some functions as void. + Change global vars `countmode' and `mode' from ints to enums. + (main): Use getopt to parse options and support POSIX options. + Don't use integer_arg to parse numbers, since `-#' can't be + parsed that way using getopt. + (find_field): Use isspace for finding fields boundaries. + +Tue Mar 20 14:28:25 1990 David J. MacKenzie (djm at pogo.ai.mit.edu) + + * comm.c (main): Call usage if given bad option or wrong + number of args. Exit with 0 status normally. + (usage): New function. + Declare some other functions as void. + +Wed Mar 14 10:48:40 1990 David J. MacKenzie (djm at rice-chex) + + * cmp.c (main, cmp, usage): Replace -q +quick option with -L + +show-chars option to add ASCII representation of bytes to -l format. + +Tue Mar 13 00:50:14 1990 David J. MacKenzie (djm at rice-chex) + + * cmp.c (cmp): Change EOF message for POSIX compatibility. + For -l format, clear bits > FF. + +Mon Mar 5 17:21:00 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * tail.c: Move global `errors' into main instead of having + nonfatal_perror set it. + (tail, tail_chars, tail_file, tail_lines, pipe_chars, pipe_lines): + Return an error status. + (file_lines, start_chars, start_lines): Reverse the meaning of + the return value. + (tail_lines, tail_chars): Account for that reversal. + +Mon Mar 5 00:34:36 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * head.c: Move global `errors' into main and have the various + functions return an error status instead of setting it in + nonfatal_perror. 
+ +Sat Mar 3 11:27:27 1990 Torbjorn Granlund (tege at echnaton) + + * cmp.c (cmp): Call function bcmp_cnt for flag == 0 (i.e. no + options specified), to compare the two blocks and count + newlines simultaneously. + * cmp.c New function: bcmp_cnt. + + * cmp.c (main): Test if output is redirected to /dev/null, and + assume `-s' if this is so. + +Tue Feb 20 17:09:19 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cat.c: Change `argbad' from a char to a short, so it will + work on machines with unsigned chars. + +Sat Feb 10 02:16:40 1990 David J. MacKenzie (djm at albert.ai.mit.edu) + + * cmp.c (cmp): Rename `r' to `first_diff', and `x' to `smaller'. + Remove unnecessary variable `c1'. If -l was given, increase + `char_number' by the number of bytes read, after producing output, + rather than by the offset of the first differing bytes, before + producing output. + Replace if-else-if constructions with case statements for clarity. + (bcmp2): Rename `n' to `nread'. + +Wed Dec 20 01:32:06 1989 David J. MacKenzie (djm at hobbes.ai.mit.edu) + + * nl.c (proc_text): Use re_search instead of re_match. + +Tue Dec 19 01:26:34 1989 David J. MacKenzie (djm at hobbes.ai.mit.edu) + + * nl.c: Indent. Un-nest statements. Use GNU regexp functions + instead of System V ones. Move function declarations together. + (quit): Remove useless function. + (program_name): New variable for error messages. + (main): Use perror in error message. + (xmalloc): New function to replace myalloc. + (myalloc): Function removed. + Global: use program_name and xmalloc. + +Sun Dec 17 00:36:36 1989 David J. MacKenzie (djm at hobbes.ai.mit.edu) + + * uniq.c: Declare some functions. + (main): Initialize infile and outfile. Call usage if given + invalid args. Normally exit with 0 status instead of garbage. + (usage): New function to print usage message and exit. + (check_file): Remove unused variable. + (readline): Compare against EOF, not < 0. + (xmalloc, xrealloc): Return char *, not int.
+ Ok to return 0 if 0 bytes requested. + (lb1, lb2): Remove unused global vars. + (concat): Remove unused function. + +Sat Dec 16 15:15:50 1989 David J. MacKenzie (djm at hobbes.ai.mit.edu) + + * comm.c: Remove unused global variables lb1, lb2. + (main): Remove unneeded variable. + (compare_files): Remove unused arg. + (readline): un-nest assignment. Test against EOF instead of < 0. + (error): Print to stderr, not stdout. + (xmalloc, xrealloc): Return char * instead of int. + Returning 0 is ok if 0 bytes requested (ANSI C). + + +Local Variables: +mode: indented-text +left-margin: 8 +version-control: never +End: diff --git a/src/cat.c b/src/cat.c new file mode 100644 index 0000000..34c4384 --- /dev/null +++ b/src/cat.c @@ -0,0 +1,660 @@ +/* cat -- concatenate files and print on the standard output. + Copyright (C) 1988, 1990, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Differences from the Unix cat: + * Always unbuffered, -u is ignored. + * 100 times faster with -v -u. + * 20 times faster with -v. + + By tege@sics.se, Torbjorn Granlund, advised by rms, Richard Stallman. */ + +#include +#include +#include +#ifndef _POSIX_SOURCE +#include +#endif +#include "system.h" + +#define max(h,i) ((h) > (i) ? 
(h) : (i)) + +char *stpcpy (); +char *xmalloc (); +void cat (); +void error (); +void next_line_num (); +void simple_cat (); + +/* Name under which this program was invoked. */ +char *program_name; + +/* Name of input file. May be "-". */ +char *infile; + +/* Descriptor on which input file is open. */ +int input_desc; + +/* Descriptor on which output file is open. Always is 1. */ +int output_desc; + +/* Buffer for line numbers. */ +char line_buf[13] = +{' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '0', '\t', '\0'}; + +/* Position in `line_buf' where printing starts. This will not change + unless the number of lines are more than 999999. */ +char *line_num_print = line_buf + 5; + +/* Position of the first digit in `line_buf'. */ +char *line_num_start = line_buf + 10; + +/* Position of the last digit in `line_buf'. */ +char *line_num_end = line_buf + 10; + +/* Preserves the `cat' function's local `newlines' between invocations. */ +int newlines2 = 0; + +/* Count of non-fatal error conditions. */ +int exit_stat = 0; + +void +usage (reason) + char *reason; +{ + if (reason != NULL) + fprintf (stderr, "%s: %s\n", program_name, reason); + + fprintf (stderr, "\ +Usage: %s [-benstuvAET] [--number] [--number-nonblank] [--squeeze-blank]\n\ + [--show-nonprinting] [--show-ends] [--show-tabs] [--show-all]\n\ + [file...]\n", + program_name); + + exit (2); +} + + +void +main (argc, argv) + int argc; + char *argv[]; +{ + /* Optimal size of i/o operations of output. */ + int outsize; + + /* Optimal size of i/o operations of input. */ + int insize; + + /* Pointer to the input buffer. */ + unsigned char *inbuf; + + /* Pointer to the output buffer. */ + unsigned char *outbuf; + + int c; + + /* Index in argv to processed argument. */ + int argind; + + /* Device number of the output (file or whatever). */ + int out_dev; + + /* I-node number of the output. */ + int out_ino; + + /* Nonzero if the output file should not be the same as any input file. 
*/ + int check_redirection = 1; + + /* Nonzero if we have ever read standard input. */ + int have_read_stdin = 0; + + struct stat stat_buf; + + /* Variables that are set according to the specified options. */ + int numbers = 0; + int numbers_at_empty_lines = 1; + int squeeze_empty_lines = 0; + int mark_line_ends = 0; + int quote = 0; + int output_tabs = 1; + int options = 0; + + static struct option long_options[] = + { + {"number-nonblank", 0, NULL, 'b'}, + {"number", 0, NULL, 'n'}, + {"squeeze-blank", 0, NULL, 's'}, + {"show-nonprinting", 0, NULL, 'v'}, + {"show-ends", 0, NULL, 'E'}, + {"show-tabs", 0, NULL, 'T'}, + {"show-all", 0, NULL, 'A'}, + {NULL, 0, NULL, 0} + }; + + program_name = argv[0]; + + /* Parse command line options. */ + + while ((c = getopt_long (argc, argv, "benstuvAET", long_options, (int *) 0)) + != EOF) + { + options++; + switch (c) + { + case 'b': + numbers = 1; + numbers_at_empty_lines = 0; + break; + + case 'e': + mark_line_ends = 1; + quote = 1; + break; + + case 'n': + numbers = 1; + break; + + case 's': + squeeze_empty_lines = 1; + break; + + case 't': + output_tabs = 0; + quote = 1; + break; + + case 'u': + /* We provide the -u feature unconditionally. */ + options--; + break; + + case 'v': + quote = 1; + break; + + case 'A': + quote = 1; + mark_line_ends = 1; + output_tabs = 0; + break; + + case 'E': + mark_line_ends = 1; + break; + + case 'T': + output_tabs = 0; + break; + + default: + usage ((char *) 0); + } + } + + output_desc = 1; + + /* Get device, i-node number, and optimal blocksize of output. */ + + if (fstat (output_desc, &stat_buf) < 0) + error (1, errno, "standard output"); + + outsize = ST_BLKSIZE (stat_buf); + /* Input file can be output file for non-regular files. + fstat on pipes returns S_IFSOCK on some systems, S_IFIFO + on others, so the checking should not be done for those types, + and to allow things like cat < /dev/tty > /dev/tty, checking + is not done for device files either. 
*/ + + if (S_ISREG (stat_buf.st_mode)) + { + out_dev = stat_buf.st_dev; + out_ino = stat_buf.st_ino; + } + else + check_redirection = 0; + + /* Check if any of the input files are the same as the output file. */ + + /* Main loop. */ + + infile = "-"; + argind = optind; + + do + { + if (argind < argc) + infile = argv[argind]; + + if (infile[0] == '-' && infile[1] == 0) + { + have_read_stdin = 1; + input_desc = 0; + } + else + { + input_desc = open (infile, O_RDONLY); + if (input_desc < 0) + { + error (0, errno, "%s", infile); + exit_stat = 1; + continue; + } + } + + if (fstat (input_desc, &stat_buf) < 0) + { + error (0, errno, "%s", infile); + exit_stat = 1; + goto contin; + } + insize = ST_BLKSIZE (stat_buf); + + /* Compare the device and i-node numbers of this input file with + the corresponding values of the (output file associated with) + stdout, and skip this input file if they coincide. Input + files cannot be redirected to themselves. */ + + if (check_redirection + && stat_buf.st_dev == out_dev && stat_buf.st_ino == out_ino) + { + error (0, 0, "%s: input file is output file", infile); + exit_stat = 1; + goto contin; + } + + /* Select which version of `cat' to use. If any options (more than -u) + were specified, use `cat', otherwise use `simple_cat'. */ + + if (options == 0) + { + insize = max (insize, outsize); + inbuf = (unsigned char *) xmalloc (insize); + + simple_cat (inbuf, insize); + } + else + { + inbuf = (unsigned char *) xmalloc (insize + 1); + + /* Why are (OUTSIZE - 1 + INSIZE * 4 + 13) bytes allocated for + the output buffer? + + A test whether output needs to be written is done when the input + buffer empties or when a newline appears in the input. After + output is written, at most (OUTSIZE - 1) bytes will remain in the + buffer. Now INSIZE bytes of input is read. Each input character + may grow by a factor of 4 (by the prepending of M-^). 
If all + characters do, and no newlines appear in this block of input, we + will have at most (OUTSIZE - 1 + INSIZE * 4) bytes in the buffer. If + the last character in the preceding block of input was a + newline, a line number may be written (according to the given + options) as the first thing in the output buffer. (Done after the + new input is read, but before processing of the input begins.) A + line number seldom requires more than 13 positions. */ + + outbuf = (unsigned char *) xmalloc (outsize - 1 + insize * 4 + 13); + + cat (inbuf, insize, outbuf, outsize, quote, + output_tabs, numbers, numbers_at_empty_lines, mark_line_ends, + squeeze_empty_lines); + + free (outbuf); + } + + free (inbuf); + + contin: + if (strcmp (infile, "-") && close (input_desc) < 0) + { + error (0, errno, "%s", infile); + exit_stat = 1; + } + } + while (++argind < argc); + + if (have_read_stdin && close (0) < 0) + error (1, errno, "-"); + if (close (1) < 0) + error (1, errno, "write error"); + + exit (exit_stat); +} + +/* Plain cat. Copies the file behind `input_desc' to the file behind + `output_desc'. */ + +void +simple_cat (buf, bufsize) + /* Pointer to the buffer, used by reads and writes. */ + unsigned char *buf; + + /* Number of characters preferably read or written by each read and write + call. */ + int bufsize; +{ + /* Actual number of characters read, and therefore written. */ + int n_read; + + /* Loop until the end of the file. */ + + for (;;) + { + /* Read a block of input. */ + + n_read = read (input_desc, buf, bufsize); + if (n_read < 0) + { + error (0, errno, "%s", infile); + exit_stat = 1; + return; + } + + /* End of this file? */ + + if (n_read == 0) + break; + + /* Write this block out. */ + + if (write (output_desc, buf, n_read) != n_read) + error (1, errno, "write error"); + } +} + +/* Cat the file behind INPUT_DESC to the file behind OUTPUT_DESC. + Called if any option more than -u was specified.
+ + A newline character is always put at the end of the buffer, to make + an explicit test for buffer end unnecessary. */ + +void +cat (inbuf, insize, outbuf, outsize, quote, + output_tabs, numbers, numbers_at_empty_lines, + mark_line_ends, squeeze_empty_lines) + + /* Pointer to the beginning of the input buffer. */ + unsigned char *inbuf; + + /* Number of characters read in each read call. */ + int insize; + + /* Pointer to the beginning of the output buffer. */ + unsigned char *outbuf; + + /* Number of characters written by each write call. */ + int outsize; + + /* Variables that have values according to the specified options. */ + int quote; + int output_tabs; + int numbers; + int numbers_at_empty_lines; + int mark_line_ends; + int squeeze_empty_lines; +{ + /* Last character read from the input buffer. */ + unsigned char ch; + + /* Pointer to the next character in the input buffer. */ + unsigned char *bpin; + + /* Pointer to the first non-valid byte in the input buffer, i.e. the + current end of the buffer. */ + unsigned char *eob; + + /* Pointer to the position where the next character shall be written. */ + unsigned char *bpout; + + /* Number of characters read by the last read call. */ + int n_read; + + /* Determines how many consecutive newlines there have been in the + input. 0 newlines makes NEWLINES -1, 1 newline makes NEWLINES 1, + etc. Initially 0 to indicate that we are at the beginning of a + new line. The "state" of the procedure is determined by + NEWLINES. */ + int newlines = newlines2; + +#ifdef FIONREAD + /* If nonzero, use the FIONREAD ioctl, as an optimization. + (On Ultrix, it is not supported on NFS filesystems.) */ + int use_fionread = 1; +#endif + + /* The inbuf pointers are initialized so that BPIN > EOB, and thereby input + is read immediately. */ + + eob = inbuf; + bpin = eob + 1; + + bpout = outbuf; + + for (;;) + { + do + { + /* Write if there are at least OUTSIZE bytes in OUTBUF.
*/ + + if (bpout - outbuf >= outsize) + { + unsigned char *wp = outbuf; + do + { + if (write (output_desc, wp, outsize) != outsize) + error (1, errno, "write error"); + wp += outsize; + } + while (bpout - wp >= outsize); + + /* Move the remaining bytes to the beginning of the + buffer. */ + + bcopy (wp, outbuf, bpout - wp); + bpout = outbuf + (bpout - wp); + } + + /* Is INBUF empty? */ + + if (bpin > eob) + { +#ifdef FIONREAD + int n_to_read = 0; + + /* Is there any input to read immediately? + If not, we are about to wait, + so write all buffered output before waiting. */ + + if (use_fionread + && ioctl (input_desc, FIONREAD, &n_to_read) < 0) + { + /* Ultrix returns EOPNOTSUPP on NFS; + HP-UX returns ENOTTY on pipes. */ + if (errno == EOPNOTSUPP || errno == ENOTTY) + use_fionread = 0; + else + { + error (0, errno, "cannot do ioctl on `%s'", infile); + exit_stat = 1; + newlines2 = newlines; + return; + } + } + if (n_to_read == 0) +#endif + { + int n_write = bpout - outbuf; + + if (write (output_desc, outbuf, n_write) != n_write) + error (1, errno, "write error"); + bpout = outbuf; + } + + /* Read more input into INBUF. */ + + n_read = read (input_desc, inbuf, insize); + if (n_read < 0) + { + error (0, errno, "%s", infile); + exit_stat = 1; + newlines2 = newlines; + return; + } + if (n_read == 0) + { + newlines2 = newlines; + return; + } + + /* Update the pointers and insert a sentinel at the buffer + end. */ + + bpin = inbuf; + eob = bpin + n_read; + *eob = '\n'; + } + else + { + /* It was a real (not a sentinel) newline. */ + + /* Was the last line empty? + (i.e. have two or more consecutive newlines been read?) */ + + if (++newlines > 0) + { + /* Are multiple adjacent empty lines to be substituted by + single ditto (-s), and this was the second empty line? */ + + if (squeeze_empty_lines && newlines >= 2) + { + ch = *bpin++; + continue; + } + + /* Are line numbers to be written at empty lines (-n)? 
*/ + + if (numbers && numbers_at_empty_lines) + { + next_line_num (); + bpout = (unsigned char *) stpcpy (bpout, line_num_print); + } + } + + /* Output a currency symbol if requested (-e). */ + + if (mark_line_ends) + *bpout++ = '$'; + + /* Output the newline. */ + + *bpout++ = '\n'; + } + ch = *bpin++; + } + while (ch == '\n'); + + /* Are we at the beginning of a line, and line numbers are requested? */ + + if (newlines >= 0 && numbers) + { + next_line_num (); + bpout = (unsigned char *) stpcpy (bpout, line_num_print); + } + + /* Here CH cannot contain a newline character. */ + + /* The loops below continue until a newline character is found, + which means that the buffer is empty or that a proper newline + has been found. */ + + /* If quoting, i.e. at least one of -v, -e, or -t specified, + scan for chars that need conversion. */ + if (quote) + for (;;) + { + if (ch >= 32) + { + if (ch < 127) + *bpout++ = ch; + else if (ch == 127) + *bpout++ = '^', + *bpout++ = '?'; + else + { + *bpout++ = 'M', + *bpout++ = '-'; + if (ch >= 128 + 32) + if (ch < 128 + 127) + *bpout++ = ch - 128; + else + *bpout++ = '^', + *bpout++ = '?'; + else + *bpout++ = '^', + *bpout++ = ch - 128 + 64; + } + } + else if (ch == '\t' && output_tabs) + *bpout++ = '\t'; + else if (ch == '\n') + { + newlines = -1; + break; + } + else + *bpout++ = '^', + *bpout++ = ch + 64; + + ch = *bpin++; + } + else + /* Not quoting, neither of -v, -e, or -t specified. */ + for (;;) + { + if (ch == '\t' && !output_tabs) + *bpout++ = '^', + *bpout++ = ch + 64; + else if (ch != '\n') + *bpout++ = ch; + else + { + newlines = -1; + break; + } + + ch = *bpin++; + } + } +} + +/* Compute the next line number. 
*/ + +void +next_line_num () +{ + char *endp = line_num_end; + do + { + if ((*endp)++ < '9') + return; + *endp-- = '0'; + } + while (endp >= line_num_start); + *--line_num_start = '1'; + if (line_num_start < line_num_print) + line_num_print--; +} diff --git a/src/cksum.c b/src/cksum.c new file mode 100644 index 0000000..df9c313 --- /dev/null +++ b/src/cksum.c @@ -0,0 +1,274 @@ +/* cksum -- calculate and print POSIX.2 checksums and sizes of files + Copyright (C) 1992 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Q. Frank Xia, qx@math.columbia.edu. + Cosmetic changes and reorganization by David MacKenzie, djm@gnu.ai.mit.edu. + + Usage: cksum [file...] + + The code segment between "#ifdef CRCTAB" and "#else" is the code + which calculates the "crctab". It is included for those who want + verify the correctness of the "crctab". To recreate the "crctab", + do following: + + cc -DCRCTAB -o crctab cksum.c + crctab > crctab.h + + As Bruce Evans pointed out to me, the crctab in the sample C code + in 4.9.10 Rationale of P1003.2/D11.2 is represented in reversed order. + Namely, 0x01 is represented as 0x80, 0x02 is represented as 0x40, etc. + The generating polynomial is crctab[0x80]=0xedb88320 instead of + crctab[1]=0x04C11DB7. But the code works only for a non-reverse order + crctab. 
Therefore, the sample implementation is wrong. + + This software is compatible with neither the System V nor the BSD + `sum' program. It is supposed to conform to P1003.2/D11.2, + except foreign language interface (4.9.5.3 of P1003.2/D11.2) support. + Any inconsistency with the standard except 4.9.5.3 is a bug. */ + +#ifdef CRCTAB + +#include + +#define BIT(x) ( (unsigned long)1 << (x) ) +#define SBIT BIT(31) + +/* The generating polynomial is + + G(X) = X^32 + X^26 + X^23 + X^22 + X^16 + X^12 + X^11 + X^10 + + X^8 + X^7 + X^5 + X^4 + X^2 + X^1 + 1 + + The i bit in GEN is set if X^i is a summand of G(X) except X^32. */ + +#define GEN (BIT(26)|BIT(23)|BIT(22)|BIT(16)|BIT(12)|BIT(11)|BIT(10)\ + |BIT(8) |BIT(7) |BIT(5) |BIT(4) |BIT(2) |BIT(1) |BIT(0)) + +unsigned long r[8]; + +/* Fill R: R[0] is GEN, and each later entry is the previous one shifted + left by one bit, XORed with GEN when bit 31 of the previous entry is set. */ + +void +fill_r () +{ + int i; + + r[0] = GEN; + for (i = 1; i < 8; i++) + r[i] = (r[i - 1] & SBIT) ? (r[i - 1] << 1) ^ r[0] : r[i - 1] << 1; +} + +/* Return the XOR of R[i] for every bit i set in the low 8 bits of M, + masked to 32 bits. */ + +unsigned long +remainder (m) + int m; +{ + unsigned long rem = 0; + int i; + + for (i = 0; i < 8; i++) + if (BIT (i) & m) + rem = rem ^ r[i]; + + return rem & 0xFFFFFFFF; /* Make it run on 64-bit machine. */ +} + +/* Print the table as C source: entry 0, then 51 rows of five entries + (1 + 51 * 5 = 256). remainder returns unsigned long, so each value + is printed with an `l' length modifier. */ + +void +main () +{ + int i; + + fill_r (); + printf ("unsigned long crctab[256] = {\n 0x0"); + for (i = 0; i < 51; i++) + { + printf (",\n 0x%08lX, 0x%08lX, 0x%08lX, 0x%08lX, 0x%08lX", + remainder (i * 5 + 1), remainder (i * 5 + 2), remainder (i * 5 + 3), + remainder (i * 5 + 4), remainder (i * 5 + 5)); + } + printf ("\n};\n"); + exit (0); +} + +#else /* !CRCTAB */ + +#include +#include +#include "system.h" + +/* Number of bytes to read at once.
*/ +#define BUFLEN (1 << 16) + +unsigned long crctab[256] = +{ + 0x0, + 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, + 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, + 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, + 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, + 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, + 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039, + 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, + 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033, + 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, + 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95, + 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, + 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, + 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, + 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, + 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, + 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, + 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, + 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B, + 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, + 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D, + 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, + 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, + 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, + 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, + 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, + 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629, + 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, + 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF, + 
0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E, + 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65, + 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8, + 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3, + 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2, + 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71, + 0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74, + 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640, + 0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21, + 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A, + 0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087, + 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC, + 0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, 0x2497D08D, + 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE, + 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB, + 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18, + 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09, + 0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662, + 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF, + 0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4 +}; + +/* The name this program was run with. */ +char *program_name; + +/* Nonzero if any of the files read were the standard input. */ +int have_read_stdin; + +/* Calculate and print the checksum and length in bytes + of file FILE, or of the standard input if FILE is "-". + If PRINT_NAME is nonzero, print FILE next to the checksum and size. + Return 0 if successful, -1 if an error occurs. 
*/ + +int +cksum (file, print_name) + char *file; + int print_name; +{ + unsigned char buf[BUFLEN]; + unsigned long crc = 0; + long length = 0; + long bytes_read; + register FILE *fp; + + if (!strcmp (file, "-")) + { + fp = stdin; + have_read_stdin = 1; /* main closes stdin once, after all files are processed. */ + } + else + { + fp = fopen (file, "r"); + if (fp == NULL) + { + error (0, errno, "%s", file); + return -1; + } + } + + while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) + { + unsigned char *cp = buf; + + length += bytes_read; /* Total byte count; it is folded into the CRC below. */ + while (bytes_read--) + crc = (crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF]; + } + + if (ferror (fp)) + { + error (0, errno, "%s", file); + if (strcmp (file, "-")) /* Never close stdin here; main does that. */ + fclose (fp); + return -1; + } + + if (strcmp (file, "-") && fclose (fp) == EOF) + { + error (0, errno, "%s", file); + return -1; + } + + bytes_read = length; /* Reuse bytes_read to feed the length into the CRC, low-order byte first. */ + while (bytes_read > 0) + { + crc = (crc << 8) ^ crctab[((crc >> 24) ^ bytes_read) & 0xFF]; + bytes_read >>= 8; + } + + crc = ~crc & 0xFFFFFFFF; /* Complement, keeping only the low 32 bits. */ + + printf ("%10lu %8ld", crc, length); + if (print_name) + printf (" %s", file); + putchar ('\n'); + + return 0; +} + +void +main (argc, argv) + int argc; + char **argv; +{ + int errors = 0; + + program_name = argv[0]; + have_read_stdin = 0; + + if (argc == 1) /* No operands: checksum the standard input and omit the file name. */ + { + if (cksum ("-", 0) < 0) + errors = 1; + } + else + { + int optind; + + for (optind = 1; optind < argc; ++optind) + if (cksum (argv[optind], 1) < 0) + errors = 1; + } + + if (have_read_stdin && fclose (stdin) == EOF) + error (1, errno, "-"); + exit (errors); /* Exit status is 1 if any file failed, otherwise 0. */ +} + +#endif /* !CRCTAB */ diff --git a/src/comm.c b/src/comm.c new file mode 100644 index 0000000..4362b64 --- /dev/null +++ b/src/comm.c @@ -0,0 +1,221 @@ +/* comm -- compare two sorted files line by line. + Copyright (C) 1986, 1990, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Richard Stallman and David MacKenzie. */ + +#include +#include +#include +#include "system.h" +#include "linebuffer.h" + +#define min(x, y) ((x) < (y) ? (x) : (y)) /* Evaluates one of its arguments twice. */ + +/* If nonzero, print lines that are found only in file 1. */ +int only_file_1; + +/* If nonzero, print lines that are found only in file 2. */ +int only_file_2; + +/* If nonzero, print lines that are found in both files. */ +int both; + +/* The name this program was run with. */ +char *program_name; + +int compare_files (); +void error (); +void writeline (); +void usage (); + +void +main (argc, argv) + int argc; + char *argv[]; +{ + int c; + + program_name = argv[0]; + + only_file_1 = 1; + only_file_2 = 1; + both = 1; /* All three columns are printed unless -1, -2 or -3 turns one off. */ + + while ((c = getopt (argc, argv, "123")) != EOF) + switch (c) + { + case '1': + only_file_1 = 0; + break; + + case '2': + only_file_2 = 0; + break; + + case '3': + both = 0; + break; + + default: + usage (); /* usage exits with status 1. */ + } + + if (optind + 2 != argc) /* Exactly two file operands are required. */ + usage (); + + exit (compare_files (argv + optind)); +} + +/* Compare INFILES[0] and INFILES[1]. + If either is "-", use the standard input for that file. + Assume that each input file is sorted; + merge them and output the result. + Return 0 if successful, 1 if any errors occur. */ + +int +compare_files (infiles) + char **infiles; +{ + /* For each file, we have one linebuffer in lb1. */ + struct linebuffer lb1[2]; + + /* thisline[i] points to the linebuffer holding the next available line + in file i, or is NULL if there are no lines left in that file.
*/ + struct linebuffer *thisline[2]; + + /* streams[i] holds the input stream for file i. */ + FILE *streams[2]; + + int i, ret = 0; + + /* Initialize the storage, open each input, and read its first line. */ + for (i = 0; i < 2; i++) + { + initbuffer (&lb1[i]); + thisline[i] = &lb1[i]; + streams[i] = strcmp (infiles[i], "-") + ? fopen (infiles[i], "r") : stdin; + if (!streams[i]) + { + error (0, errno, "%s", infiles[i]); + return 1; /* NOTE(review): when i is 1, the stream already opened for file 0 is not closed here. */ + } + + thisline[i] = readline (thisline[i], streams[i]); + } + + while (thisline[0] || thisline[1]) + { + int order; + + /* Compare the next available lines of the two files. */ + + if (!thisline[0]) + order = 1; /* File 0 is exhausted: the line from file 1 is output. */ + else if (!thisline[1]) + order = -1; /* File 1 is exhausted: the line from file 0 is output. */ + else + { + /* Cannot use bcmp -- it only returns a boolean value. */ + order = memcmp (thisline[0]->buffer, thisline[1]->buffer, + min (thisline[0]->length, thisline[1]->length)); + if (order == 0) + order = thisline[0]->length - thisline[1]->length; /* Equal up to the shorter length: the shorter line is the lesser. */ + } + + /* Output the line that is lesser; equal lines are written once, as class 3. */ + if (order == 0) + writeline (thisline[1], stdout, 3); + else if (order > 0) + writeline (thisline[1], stdout, 2); + else + writeline (thisline[0], stdout, 1); + + /* Step the file the line came from. + If the files match, step both files. */ + if (order >= 0) + thisline[1] = readline (thisline[1], streams[1]); + if (order <= 0) + thisline[0] = readline (thisline[0], streams[0]); + } + + /* Free all storage and close all input streams. */ + for (i = 0; i < 2; i++) + { + free (lb1[i].buffer); + if (ferror (streams[i]) || fclose (streams[i]) == EOF) + { + error (0, errno, "%s", infiles[i]); + ret = 1; + } + } + if (ferror (stdout) || fclose (stdout) == EOF) /* Output errors are detected only here, once at the end. */ + { + error (0, errno, "write error"); + ret = 1; + } + return ret; +} + +/* Output the line in linebuffer LINE to stream STREAM + provided the switches say it should be output. + CLASS is 1 for a line found only in file 1, + 2 for a line only in file 2, 3 for a line in both.
*/ + +void +writeline (line, stream, class) + struct linebuffer *line; + FILE *stream; + int class; +{ + switch (class) + { + case 1: + if (!only_file_1) + return; /* Column 1 is suppressed. */ + break; + + case 2: + if (!only_file_2) + return; /* Column 2 is suppressed. */ + /* Skip the tab stop for case 1, if we are printing case 1. */ + if (only_file_1) + putc ('\t', stream); + break; + + case 3: + if (!both) + return; /* Column 3 is suppressed. */ + /* Skip the tab stop for case 1, if we are printing case 1. */ + if (only_file_1) + putc ('\t', stream); + /* Skip the tab stop for case 2, if we are printing case 2. */ + if (only_file_2) + putc ('\t', stream); + break; + } + + fwrite (line->buffer, sizeof (char), line->length, stream); + putc ('\n', stream); /* Write results are not checked here; compare_files tests ferror (stdout) at the end. */ +} + +void +usage () +{ + fprintf (stderr, "Usage: %s [-123] file1 file2\n", program_name); + exit (1); /* A usage error always terminates the program. */ +} diff --git a/src/csplit.c b/src/csplit.c new file mode 100644 index 0000000..56bffa3 --- /dev/null +++ b/src/csplit.c @@ -0,0 +1,1308 @@ +/* csplit - split a file into sections determined by context lines + Copyright (C) 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Stuart Kemp, cpsrk@groper.jcu.edu.au. + Modified by David MacKenzie, djm@gnu.ai.mit.edu.
*/ + +#include +#include +#include +#include +#include +#include "regex.h" +#include "system.h" + +#if !defined(USG) && !defined(STDC_HEADERS) +char *memchr (); +#endif + +#ifdef STDC_HEADERS +#include +#else +char *malloc (); +char *realloc (); +#endif + +void error (); + +void cleanup (); +void close_output_file (); +void create_output_file (); +void save_line_to_file (); +void usage (); + +#ifndef TRUE +#define FALSE 0 +#define TRUE 1 +#endif + +/* Increment size of area for control records. */ +#define ALLOC_SIZE 20 + +/* The default prefix for output file names. */ +#define DEFAULT_PREFIX "xx" + +typedef int boolean; + +/* A compiled pattern arg: a regexp when REGEXPR is non-NULL, otherwise a line count. */ +struct control +{ + char *regexpr; /* Non-compiled regular expression; NULL for a line-count arg. */ + struct re_pattern_buffer re_compiled; /* Compiled regular expression. */ + int offset; /* Offset from regexp to split at. */ + int lines_required; /* Number of lines required. */ + int repeat; /* Repeat count; the arg is processed REPEAT + 1 times. */ + int argnum; /* ARGV index. */ + boolean ignore; /* If true, produce no output (for regexp). */ +}; + +/* Initial size of data area in buffers. */ +#define START_SIZE 8191 + +/* Increment size for data area. */ +#define INCR_SIZE 2048 + +/* Number of lines kept in each node in line list. */ +#define CTRL_SIZE 80 + +#ifdef DEBUG +/* Some small values to test the algorithms. NOTE(review): these redefine the three macros above without an #undef. */ +#define START_SIZE 200 +#define INCR_SIZE 10 +#define CTRL_SIZE 1 +#endif + +/* A string with a length count. */ +struct cstring +{ + int len; /* Number of chars in STR. */ + char *str; /* Start of the chars; keep_new_line points this into a buffer data area. */ +}; + +/* Pointers to the beginnings of lines in the buffer area. + These structures are linked together if needed. */ +struct line +{ + unsigned used; /* Number of offsets used in this struct. */ + unsigned insert_index; /* Next offset to use when inserting line. */ + unsigned retrieve_index; /* Next index to use when retrieving line. */ + struct cstring starts[CTRL_SIZE]; /* Lines in the data area. */ + struct line *next; /* Next in linked list. */ +}; + +/* The structure to hold the input lines.
+ Contains a pointer to the data area and a list containing + pointers to the individual lines. */ +struct buffer_record +{ + unsigned bytes_alloc; /* Size of the buffer area. */ + unsigned bytes_used; /* Bytes used in the buffer area. */ + unsigned start_line; /* First line number in this buffer. */ + unsigned first_available; /* First line that can be retrieved. */ + unsigned num_lines; /* Number of complete lines in this buffer. */ + char *buffer; /* Data area. */ + struct line *line_start; /* Head of list of pointers to lines. */ + struct line *curr_line; /* The line start record currently in use. */ + struct buffer_record *next; /* Next buffer in whichever list (free or pending) holds this one. */ +}; + +/* Input file descriptor; 0 is the standard input. */ +int input_desc = 0; + +/* List of available buffers. */ +struct buffer_record *free_list = NULL; + +/* Start of the list of buffers that hold data yet to be processed. */ +struct buffer_record *head = NULL; + +/* Partially read line; points into the data area of an existing buffer. */ +char *hold_area = NULL; + +/* Number of chars in `hold_area'. */ +unsigned hold_count = 0; + +/* Number of the last line in the buffers. */ +unsigned last_line_number = 0; + +/* Number of the line currently being examined. */ +unsigned current_line = 0; + +/* Number of the last line in the input file. */ +unsigned last_line_in_file = 0; + +/* If TRUE, we have read EOF. */ +boolean have_read_eof = FALSE; + +/* Space in which make_filename builds each output file name. */ +char *filename_space = NULL; + +/* Prefix part of output file names. */ +char *prefix = NULL; + +/* Number of digits to use in output file names. */ +int digits = 2; + +/* Number of files created so far. */ +unsigned files_created = 0; + +/* Number of bytes written to current file. */ +unsigned bytes_written; + +/* Output file pointer; NULL when no output file is open. */ +FILE *output_stream = NULL; + +/* Perhaps it would be cleaner to pass arg values instead of indexes. */ +char **global_argv; + +/* If TRUE, do not print the count of bytes in each output file. */ +boolean suppress_count; + +/* If TRUE, remove output files on error.
*/ +boolean remove_files; + +/* The compiled pattern arguments, which determine how to split + the input file. */ +struct control *controls; + +/* Number of elements of `controls' that are in use. */ +unsigned control_used; + +/* The name this program was run with. */ +char *program_name; + +/* Allocate N bytes of memory dynamically, with error checking. + On failure, report the error and exit through cleanup. */ + +char * +xmalloc (n) + unsigned n; +{ + char *p; + + p = malloc (n); + if (p == NULL) + { + error (0, 0, "virtual memory exhausted"); + cleanup (); /* Does not return: cleanup ends with exit (1). */ + } + return p; +} + +/* Change the size of an allocated block of memory P to N bytes, + with error checking. + If P is NULL, run xmalloc. + If N is 0, run free and return NULL. */ + +char * +xrealloc (p, n) + char *p; + unsigned n; +{ + if (p == NULL) + return xmalloc (n); + if (n == 0) + { + free (p); + return 0; + } + p = realloc (p, n); + if (p == NULL) + { + error (0, 0, "virtual memory exhausted"); + cleanup (); /* Does not return: cleanup ends with exit (1). */ + } + return p; +} + +/* Keep track of NUM chars of a partial line in buffer START. + These chars will be retrieved later when another large buffer is read. + It is not necessary to create a new buffer for these chars; instead, + we keep a pointer to the existing buffer. This buffer *is* on the + free list, and when the next buffer is obtained from this list + (even if it is this one), these chars will be placed at the + start of the new buffer. */ + +void +save_to_hold_area (start, num) + char *start; + unsigned num; +{ + hold_area = start; + hold_count = num; +} + +/* Read up to MAX chars from the input stream into DEST. + Return the number of chars read. + Set have_read_eof when read returns 0; exit through cleanup on a read error. */ + +int +read_input (dest, max) + char *dest; + unsigned max; +{ + int bytes_read; + + if (max == 0) /* Nothing was requested, so this is not treated as end of file. */ + return 0; + + bytes_read = read (input_desc, dest, max); + + if (bytes_read == 0) + have_read_eof = TRUE; + + if (bytes_read < 0) + { + error (0, errno, "read error"); + cleanup (); + } + + return bytes_read; +} + +/* Initialize existing line record P.
*/ + +void +clear_line_control (p) + struct line *p; +{ + p->used = 0; + p->insert_index = 0; + p->retrieve_index = 0; +} + +/* Initialize all line records in B. + The records stay chained together; only their counters are reset. */ + +void +clear_all_line_control (b) + struct buffer_record *b; +{ + struct line *l; + + for (l = b->line_start; l; l = l->next) + clear_line_control (l); +} + +/* Return a new, initialized line record. */ + +struct line * +new_line_control () +{ + struct line *p; + + p = (struct line *) xmalloc (sizeof (struct line)); + + p->next = NULL; + clear_line_control (p); + + return p; +} + +/* Record LINE_START, which is the address of the start of a line + of length LINE_LEN in the large buffer, in the lines buffer of B. */ + +void +keep_new_line (b, line_start, line_len) + struct buffer_record *b; + char *line_start; + int line_len; +{ + struct line *l; + + /* If there is no existing area to keep line info, get some. */ + if (b->line_start == NULL) + b->line_start = b->curr_line = new_line_control (); + + /* If existing area for lines is full, move on to the next record. + A recycled buffer may already have a cleared record chained here; + reuse it rather than overwriting the link, which would leak it. */ + if (b->curr_line->used == CTRL_SIZE) + { + if (b->curr_line->next == NULL) + b->curr_line->next = new_line_control (); + b->curr_line = b->curr_line->next; + } + + l = b->curr_line; + + /* Record the start of the line, and update counters. */ + l->starts[l->insert_index].str = line_start; + l->starts[l->insert_index].len = line_len; + l->used++; + l->insert_index++; +} + +/* Scan the buffer in B for newline characters + and record the line start locations and lengths in B. + Return the number of lines found in this buffer. + + There may be an incomplete line at the end of the buffer; + a pointer is kept to this area, which will be used when + the next buffer is filled. */ + +unsigned +record_line_starts (b) + struct buffer_record *b; +{ + char *line_start; /* Start of current line. */ + char *line_end; /* End of each line found. */ + unsigned bytes_left; /* Length of incomplete last line. */ + unsigned lines; /* Number of lines found.
*/ + unsigned line_length; /* Length of each line found. */ + + if (b->bytes_used == 0) + return 0; + + lines = 0; + line_start = b->buffer; + bytes_left = b->bytes_used; + + for (;;) + { + line_end = memchr (line_start, '\n', bytes_left); + if (line_end == NULL) + break; + line_length = line_end - line_start + 1; /* The length includes the newline. */ + keep_new_line (b, line_start, line_length); + bytes_left -= line_length; + line_start = line_end + 1; + lines++; + } + + /* Check for an incomplete last line: at EOF it counts as a line, + otherwise it is held over for the next buffer. */ + if (bytes_left) + { + if (have_read_eof) + { + keep_new_line (b, line_start, bytes_left); + lines++; + last_line_in_file = last_line_number + lines; + } + else + save_to_hold_area (line_start, bytes_left); + } + + b->num_lines = lines; + b->first_available = b->start_line = last_line_number + 1; + last_line_number += lines; + + return lines; +} + +/* Return a new buffer with room to store SIZE bytes, plus + an extra byte for safety. */ + +struct buffer_record * +create_new_buffer (size) + unsigned size; +{ + struct buffer_record *new_buffer; + + new_buffer = (struct buffer_record *) + xmalloc (sizeof (struct buffer_record)); + + new_buffer->buffer = (char *) xmalloc (size + 1); + + new_buffer->bytes_alloc = size; + new_buffer->line_start = new_buffer->curr_line = NULL; /* The remaining fields are set by the caller, get_new_buffer. */ + + return new_buffer; +} + +/* Return a new buffer of at least MIN_SIZE bytes. If a buffer of at + least that size is currently free, use it, otherwise create a new one. */ + +struct buffer_record * +get_new_buffer (min_size) + unsigned min_size; +{ + struct buffer_record *p, *q; + struct buffer_record *new_buffer; /* Buffer to return. */ + unsigned alloc_size; /* Actual size that will be requested. */ + + alloc_size = START_SIZE; + while (min_size > alloc_size) + alloc_size += INCR_SIZE; + + if (free_list == NULL) + new_buffer = create_new_buffer (alloc_size); + else + { + /* Use first-fit to find a buffer.
*/ + p = new_buffer = NULL; + q = free_list; + + do + { + if (q->bytes_alloc >= min_size) + { + if (p == NULL) + free_list = q->next; /* Q is the first free buffer: unlink it at the head. */ + else + p->next = q->next; /* Unlink Q from behind its predecessor P. */ + break; + } + p = q; + q = q->next; + } + while (q); + + new_buffer = (q ? q : create_new_buffer (alloc_size)); /* Q is NULL here if no free buffer was large enough. */ + + new_buffer->curr_line = new_buffer->line_start; + clear_all_line_control (new_buffer); + } + + new_buffer->num_lines = 0; + new_buffer->bytes_used = 0; + new_buffer->start_line = new_buffer->first_available = last_line_number + 1; + new_buffer->next = NULL; + + return new_buffer; +} + +/* Add buffer BUF to the front of the list of free buffers. */ + +void +free_buffer (buf) + struct buffer_record *buf; +{ + buf->next = free_list; + free_list = buf; +} + +/* Append buffer BUF to the linked list of buffers that contain + some data yet to be processed. */ + +void +save_buffer (buf) + struct buffer_record *buf; +{ + struct buffer_record *p; + + buf->next = NULL; + buf->curr_line = buf->line_start; /* Retrieval starts at the first line record. */ + + if (head == NULL) + head = buf; + else + { + for (p = head; p->next; p = p->next) + /* Do nothing. */ ; + p->next = buf; + } +} + +/* Fill a buffer of input. + + Set the initial size of the buffer to a default. + Fill the buffer (from the hold area and input stream) + and find the individual lines. + If no lines are found (the buffer is too small to hold the next line), + release the current buffer (whose contents would have been put in the + hold area) and repeat the process with another large buffer until at least + one entire line has been read. + + Return TRUE if a new buffer was obtained, otherwise false + (in which case end-of-file must have been encountered). */ + +boolean +load_buffer () +{ + struct buffer_record *b; + unsigned bytes_wanted = START_SIZE; /* Minimum buffer size. */ + unsigned bytes_avail; /* Size of new buffer created. */ + unsigned lines_found; /* Number of lines in this new buffer. */ + char *p; /* Place to load into buffer.
*/ + + if (have_read_eof) + return FALSE; + + /* We must make the buffer at least as large as the amount of data + in the partial line left over from the last call. */ + if (bytes_wanted < hold_count) + bytes_wanted = hold_count; + + do + { + b = get_new_buffer (bytes_wanted); + bytes_avail = b->bytes_alloc; /* Size of buffer returned. */ + p = b->buffer; + + /* First check the `holding' area for a partial line. */ + if (hold_count) + { + if (p != hold_area) /* No copy is needed when the held chars already start this buffer. */ + bcopy (hold_area, p, hold_count); + p += hold_count; + b->bytes_used += hold_count; + bytes_avail -= hold_count; + hold_count = 0; + } + + b->bytes_used += (unsigned) read_input (p, bytes_avail); + + lines_found = record_line_starts (b); + bytes_wanted = b->bytes_alloc + INCR_SIZE; /* If no line fit, the next pass asks for a larger buffer. */ + if (!lines_found) + free_buffer (b); + } + while (!lines_found && !have_read_eof); + + if (lines_found) + save_buffer (b); + + return lines_found != 0; +} + +/* Return the line number of the first line that has not yet been retrieved. + If no buffer is pending and none can be loaded, call error with status 1. */ + +unsigned +get_first_line_in_buffer () +{ + if (head == NULL && !load_buffer ()) + error (1, errno, "input disappeared"); + + return head->first_available; +} + +/* Return a pointer to the logical first line in the buffer and make the + next line the logical first line. + Return NULL if there is no more input. */ + +struct cstring * +remove_line () +{ + struct cstring *line; /* Return value. */ + unsigned line_got; /* Number of the line retrieved; assigned below but not otherwise used. */ + struct line *l; /* For convenience. */ + + if (head == NULL && !load_buffer ()) + return NULL; + + if (current_line < head->first_available) + current_line = head->first_available; + + line_got = head->first_available++; + + l = head->curr_line; + + line = &l->starts[l->retrieve_index]; + + /* Advance index to next line. */ + if (++l->retrieve_index == l->used) + { + /* Go on to the next line record. */ + head->curr_line = l->next; + if (head->curr_line == NULL || head->curr_line->used == 0) + { + /* Go on to the next data block.
*/ + struct buffer_record *b = head; + head = head->next; + free_buffer (b); + } + } + + return line; +} + +/* Search the buffers for line LINENUM, reading more input if necessary. + Return a pointer to the line, or NULL if it is not found in the file. + Also return NULL if LINENUM precedes the first line still buffered. */ + +struct cstring * +find_line (linenum) + unsigned linenum; +{ + struct buffer_record *b; + + if (head == NULL && !load_buffer ()) + return NULL; + + if (linenum < head->start_line) + return NULL; + + for (b = head;;) + { + if (linenum < b->start_line + b->num_lines) + { + /* The line is in this buffer. */ + struct line *l; + unsigned offset; /* How far into the buffer the line is. */ + + l = b->line_start; + offset = linenum - b->start_line; + /* Find the control record; each one skipped accounts for CTRL_SIZE lines. */ + while (offset >= CTRL_SIZE) + { + l = l->next; + offset -= CTRL_SIZE; + } + return &l->starts[offset]; + } + if (b->next == NULL && !load_buffer ()) + return NULL; + b = b->next; /* Try the next data block. */ + } +} + +/* Return TRUE if no more lines are available for input, + that is, if line CURRENT_LINE + 1 cannot be found; otherwise FALSE. */ + +boolean +no_more_lines () +{ + return (find_line (current_line + 1) == NULL) ? TRUE : FALSE; +} + +/* Open NAME as the input file; "-" selects the standard input (descriptor 0). */ + +void +set_input_file (name) + char *name; +{ + if (!strcmp (name, "-")) + input_desc = 0; + else + { + input_desc = open (name, O_RDONLY); + if (input_desc < 0) + error (1, errno, "%s", name); + } +} + +/* Write all lines from the beginning of the buffer up to, but + not including, line LAST_LINE, to the current output file. + If IGNORE is TRUE, do not output lines selected here. + ARGNUM is the index in ARGV of the current pattern. */ + +void +write_to_file (last_line, ignore, argnum) + unsigned last_line; + boolean ignore; + int argnum; +{ + struct cstring *line; + unsigned first_line; /* First available input line. */ + unsigned lines; /* Number of lines to output.
*/ + unsigned i; + + first_line = get_first_line_in_buffer (); + + if (first_line > last_line) + { + error (0, 0, "%s: line number out of range", global_argv[argnum]); + cleanup (); + } + + lines = last_line - first_line; /* LAST_LINE itself is not written. */ + + for (i = 0; i < lines; i++) + { + line = remove_line (); + if (line == NULL) + { + error (0, 0, "%s: line number out of range", global_argv[argnum]); + cleanup (); + } + if (!ignore) /* Ignored lines are still removed from the buffer, just not saved. */ + save_line_to_file (line); + } +} + +/* Output any lines left after all regexps have been processed. */ + +void +dump_rest_of_file () +{ + struct cstring *line; + + while ((line = remove_line ()) != NULL) + save_line_to_file (line); +} + +/* Handle an attempt to read beyond EOF under the control of record P, + on iteration REPETITION if nonzero. + Print a diagnostic on stderr and exit through cleanup. */ + +void +handle_line_error (p, repetition) + struct control *p; + int repetition; +{ + fprintf (stderr, "%s: `%d': line number out of range", + program_name, p->lines_required); + if (repetition) + fprintf (stderr, " on repetition %d\n", repetition); + else + fprintf (stderr, "\n"); + + cleanup (); +} + +/* Determine the line number that marks the end of this file, + then get those lines and save them to the output file. + P is the control record. + REPETITION is the repetition number. */ + +void +process_line_count (p, repetition) + struct control *p; + int repetition; +{ + unsigned linenum; + unsigned last_line_to_save = p->lines_required * (repetition + 1); /* Each repetition advances by another LINES_REQUIRED lines. */ + struct cstring *line; + + create_output_file (); + + linenum = get_first_line_in_buffer (); + + /* Check for requesting a line that has already been written out. + If this ever happens, it's due to a bug in csplit.
*/ + if (linenum >= last_line_to_save) + handle_line_error (p, repetition); + + while (linenum++ < last_line_to_save) /* The loop stops before line LAST_LINE_TO_SAVE, which is left in the buffer. */ + { + line = remove_line (); + if (line == NULL) + handle_line_error (p, repetition); + save_line_to_file (line); + } + + close_output_file (); + + /* Ensure that the line number specified is not 1 greater than + the number of lines in the file. */ + if (no_more_lines ()) + handle_line_error (p, repetition); +} + +void +regexp_error (p, repetition, ignore) + struct control *p; + int repetition; + boolean ignore; +{ + fprintf (stderr, "%s: `%s': match not found", + program_name, global_argv[p->argnum]); + + if (repetition) + fprintf (stderr, " on repetition %d\n", repetition); + else + fprintf (stderr, "\n"); + + if (!ignore) /* Flush the rest of the input into the current output file before exiting. */ + { + dump_rest_of_file (); + close_output_file (); + } + cleanup (); /* Does not return: cleanup ends with exit (1). */ +} + +/* Read the input until a line matches the regexp in P, outputting + it unless P->IGNORE is TRUE. + REPETITION is this repeat-count; 0 means the first time. */ + +void +process_regexp (p, repetition) + struct control *p; + int repetition; +{ + struct cstring *line; /* From input file. */ + register unsigned line_len; /* To make "$" in regexps work. */ + unsigned break_line; /* First line number of next file. */ + boolean ignore = p->ignore; /* If TRUE, skip this section. */ + int ret; + + if (!ignore) + create_output_file (); + + /* If there is no offset for the regular expression, or + it is positive, then it is not necessary to buffer the lines.
*/ + + if (p->offset >= 0) + { + for (;;) + { + line = find_line (++current_line); + if (line == NULL) + regexp_error (p, repetition, ignore); + line_len = line->len; + if (line->str[line_len - 1] == '\n') /* Exclude the newline from the text that is searched. */ + line_len--; + ret = re_search (&p->re_compiled, line->str, line_len, + 0, line_len, (struct re_registers *) 0); + if (ret == -2) + { + error (0, 0, "error in regular expression search"); + cleanup (); + } + if (ret == -1) /* No match: this line belongs to the current section. */ + { + line = remove_line (); + if (!ignore) + save_line_to_file (line); + } + else + break; + } + } + else + { + /* Buffer the lines: with a negative offset the split point lies before the matching line. */ + for (;;) + { + line = find_line (++current_line); + if (line == NULL) + regexp_error (p, repetition, ignore); + line_len = line->len; + if (line->str[line_len - 1] == '\n') /* Exclude the newline from the text that is searched. */ + line_len--; + ret = re_search (&p->re_compiled, line->str, line_len, + 0, line_len, (struct re_registers *) 0); + if (ret == -2) + { + error (0, 0, "error in regular expression search"); + cleanup (); + } + if (ret >= 0) + break; + } + } + + /* Account for any offset from this regexp. */ + break_line = current_line + p->offset; + + write_to_file (break_line, ignore, p->argnum); + + if (!ignore) + close_output_file (); + + current_line = break_line; +} + +/* Split the input file according to the control records we have built. + Whatever input remains afterwards goes into one final output file. */ + +void +split_file () +{ + register int i, j; + + for (i = 0; i < control_used; i++) + { + if (controls[i].regexpr) + { + for (j = 0; j <= controls[i].repeat; j++) /* A repeat count of N means N + 1 passes. */ + process_regexp (&controls[i], j); + } + else + { + for (j = 0; j <= controls[i].repeat; j++) /* A repeat count of N means N + 1 passes. */ + process_line_count (&controls[i], j); + } + } + + create_output_file (); + dump_rest_of_file (); + close_output_file (); +} + +/* Return the name of output file number NUM. + The name is built in filename_space, so each call overwrites the previous result. */ + +char * +make_filename (num) + int num; +{ + sprintf (filename_space, "%s%0*d", prefix, digits, num); + return filename_space; +} + +/* Create the next output file.
*/ + +void +create_output_file () +{ + char *name; + + name = make_filename (files_created); + output_stream = fopen (name, "w"); + if (output_stream == NULL) + { + error (0, errno, "%s", name); + cleanup (); + } + files_created++; + bytes_written = 0; +} + +/* Delete all the files we have created. */ + +void +delete_all_files () +{ + int i; + char *name; + + for (i = 0; i < files_created; i++) + { + name = make_filename (i); + if (unlink (name)) + error (0, errno, "%s", name); + } +} + +/* Close the current output file and print the count + of characters in this file. */ + +void +close_output_file () +{ + if (output_stream) + { + if (fclose (output_stream) == EOF) + { + error (0, errno, "write error"); + cleanup (); + } + if (!suppress_count) + fprintf (stdout, "%d\n", bytes_written); + output_stream = NULL; + } +} + +/* Optionally remove files created so far; then exit. + Called when an error detected. */ + +void +cleanup () +{ + if (output_stream) + close_output_file (); + + if (remove_files) + delete_all_files (); + + exit (1); +} + +/* Save line LINE to the output file and + increment the character count for the current file. */ + +void +save_line_to_file (line) + struct cstring *line; +{ + fwrite (line->str, sizeof (char), line->len, output_stream); + bytes_written += line->len; +} + +/* Return a new, initialized control record. */ + +struct control * +new_control_record () +{ + static unsigned control_allocated = 0; /* Total space allocated. 
*/ + register struct control *p; + + if (control_allocated == 0) + { + control_allocated = ALLOC_SIZE; + controls = (struct control *) + xmalloc (sizeof (struct control) * control_allocated); + } + else if (control_used == control_allocated) + { + control_allocated += ALLOC_SIZE; + controls = (struct control *) + xrealloc (controls, sizeof (struct control) * control_allocated); + } + p = &controls[control_used++]; + p->regexpr = NULL; + p->repeat = 0; + p->lines_required = 0; + p->offset = 0; + return p; +} + +/* Convert string NUM to an integer and put the value in *RESULT. + Return a TRUE if the string consists entirely of digits, + FALSE if not. */ + +boolean +string_to_number (result, num) + int *result; + char *num; +{ + register char ch; + register int val = 0; + + if (*num == '\0') + return FALSE; + + while (ch = *num++) + { + if (!isdigit (ch)) + return FALSE; + val = val * 10 + ch - '0'; + } + + *result = val; + return TRUE; +} + +/* Check if there is a numeric offset after a regular expression. + STR is the entire command line argument. + ARGNUM is the index in ARGV of STR. + P is the control record for this regular expression. + NUM is the numeric part of STR. */ + +void +check_for_offset (argnum, p, str, num) + int argnum; + struct control *p; + char *str; + char *num; +{ + if (*num != '-' && *num != '+') + error (1, 0, "%s: `+' or `-' expected after delimeter", str); + + if (!string_to_number (&p->offset, num + 1)) + error (1, 0, "%s: integer expected after `%c'", str, *num); + + if (*num == '-') + p->offset = -p->offset; +} + +/* Given that the first character of command line arg STR is '{', + make sure that the rest of the string is a valid repeat count + and store its value in P. + ARGNUM is the ARGV index of STR. 
*/ + +void +parse_repeat_count (argnum, p, str) + int argnum; + struct control *p; + char *str; +{ + char *end; + + end = str + strlen (str) - 1; + if (*end != '}') + error (1, 0, "%s: `}' is required in repeat count", str); + *end = '\0'; + + if (!string_to_number (&p->repeat, str + 1)) + error (1, 0, "%s}: integer required between `{' and `}'", + global_argv[argnum]); + + *end = '}'; +} + +/* Extract the regular expression from STR and check for a numeric offset. + STR should start with the regexp delimiter character. + Return a new control record for the regular expression. + ARGNUM is the ARGV index of STR. + Unless IGNORE is TRUE, mark these lines for output. */ + +struct control * +extract_regexp (argnum, ignore, str) + int argnum; + boolean ignore; + char *str; +{ + int len; /* Number of chars in this regexp. */ + char delim = *str; + char *closing_delim; + struct control *p; + char *err; + + closing_delim = rindex (str + 1, delim); + if (closing_delim == NULL) + error (1, 0, "%s: closing delimeter `%c' missing", str, delim); + + len = closing_delim - str - 1; + p = new_control_record (); + p->argnum = argnum; + p->ignore = ignore; + + p->regexpr = (char *) xmalloc ((unsigned) (len + 1)); + strncpy (p->regexpr, str + 1, len); + p->re_compiled.allocated = len * 2; + p->re_compiled.buffer = (unsigned char *) xmalloc (p->re_compiled.allocated); + p->re_compiled.fastmap = xmalloc (256); + p->re_compiled.translate = 0; + err = re_compile_pattern (p->regexpr, len, &p->re_compiled); + if (err) + { + error (0, 0, "%s: invalid regular expression: %s", str, err); + cleanup (); + } + + if (closing_delim[1]) + check_for_offset (argnum, p, str, closing_delim + 1); + + return p; +} + +/* Extract the break patterns from args START through ARGC - 1 of ARGV. + After each pattern, check if the next argument is a repeat count. */ + +void +parse_patterns (argc, start, argv) + int argc; + int start; + char **argv; +{ + int i; /* Index into ARGV. 
/* Signal handler that main installs for SIGHUP, SIGINT, SIGQUIT and
   SIGTERM (unless the signal was being ignored): report the interruption,
   then exit through cleanup (), which closes the current output file and,
   when `remove_files' is set, deletes the files created so far.
   NOTE(review): cleanup () calls fclose, fprintf and exit, none of which
   are async-signal-safe; error () presumably writes through stdio as well.
   Confirm this is acceptable or defer the work out of the handler.  */

void
interrupt_handler ()
{
  error (0, 0, "interrupted");
  cleanup ();
}
signal (SIGTERM, interrupt_handler); +#endif + + while ((optc = getopt_long (argc, argv, "f:kn:s", longopts, (int *) 0)) + != EOF) + switch (optc) + { + case 'f': + prefix = optarg; + break; + + case 'k': + remove_files = FALSE; + break; + + case 'n': + if (!string_to_number (&digits, optarg)) + error (1, 0, "%s: invalid number", optarg); + break; + + case 's': + suppress_count = TRUE; + break; + + default: + usage (); + } + + if (optind >= argc - 1) + usage (); + + filename_space = (char *) xmalloc (strlen (prefix) + digits + 2); + + set_input_file (argv[optind++]); + + parse_patterns (argc, optind, argv); + + split_file (); + + if (close (input_desc) < 0) + { + error (0, errno, "read error"); + cleanup (); + } + + exit (0); +} + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-sk] [-f prefix] [-n digits] [--prefix=prefix]\n\ + [--digits=digits] [--quiet] [--silent] [--keep-files] file pattern...\n", + program_name); + exit (1); +} diff --git a/src/cut.c b/src/cut.c new file mode 100644 index 0000000..93808b0 --- /dev/null +++ b/src/cut.c @@ -0,0 +1,586 @@ +/* cut - remove parts of lines of files + Copyright (C) 1984 by David M. Ihnat + + This program is a total rewrite of the Bell Laboratories Unix(Tm) + command of the same name, as of System V. It contains no proprietary + code, and therefore may be used without violation of any proprietary + agreements whatsoever. However, you will notice that the program is + copyrighted by me. This is to assure the program does *not* fall + into the public domain. Thus, I may specify just what I am now: + This program may be freely copied and distributed, provided this notice + remains; it may not be sold for profit without express written consent of + the author. + Please note that I recreated the behavior of the Unix(Tm) 'cut' command + as faithfully as possible; however, I haven't run a full set of regression + tests. 
Thus, the user of this program accepts full responsibility for any + effects or loss; in particular, the author is not responsible for any losses, + explicit or incidental, that may be incurred through use of this program. + + I ask that any bugs (and, if possible, fixes) be reported to me when + possible. -David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us + + POSIX changes, bug fixes, long-named options, and cleanup + by David MacKenzie . + + Options: + --bytes=byte-list + -b byte-list Print only the bytes in positions listed + in BYTE-LIST. + Tabs and backspaces are treated like any + other character; they take up 1 byte. + + --characters=character-list + -c character-list Print only characters in positions listed + in CHARACTER-LIST. + The same as -b for now, but + internationalization will change that. + Tabs and backspaces are treated like any + other character; they take up 1 character. + + --fields=field-list + -f field-list Print only the fields listed in FIELD-LIST. + Fields are separated by a TAB by default. + + --delimiter=delim + -d delim For -f, fields are separated by the first + character in DELIM instead of TAB. + + -n Do not split multibyte chars (no-op for now). + + --only-delimited + -s For -f, do not print lines that do not contain + the field separator character. + + The BYTE-LIST, CHARACTER-LIST, and FIELD-LIST are one or more numbers + or ranges separated by commas. The first byte, character, and field + are numbered 1. + + A FILE of `-' means standard input. 
*/ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISDIGIT(c) (isascii ((c)) && isdigit ((c))) +#else +#define ISDIGIT(c) (isdigit ((c))) +#endif + +char *xmalloc (); +char *xrealloc (); +int set_fields (); +int cut_file (); +void cut_stream (); +void cut_bytes (); +void cut_fields (); +void enlarge_line (); +void error (); +void invalid_list (); +void usage (); + +/* The number of elements allocated for the input line + and the byte or field number. + Enlarged as necessary. */ +int line_size; + +/* Processed output buffer. */ +char *outbuf; + +/* Where to save next char to output. */ +char *outbufptr; + +/* Raw line buffer for field mode. */ +char *inbuf; + +/* Where to save next input char. */ +char *inbufptr; + +/* What can be done about a byte or field. */ +enum field_action +{ + field_omit, + field_output +}; + +/* In byte mode, which bytes to output. + In field mode, which `delim'-separated fields to output. + Both bytes and fields are numbered starting with 1, + so the first element of `fields' is unused. */ +enum field_action *fields; + +enum operating_mode +{ + undefined_mode, + + /* Output characters that are in the given bytes. */ + byte_mode, + + /* Output the given delimeter-separated fields. */ + field_mode +}; + +enum operating_mode operating_mode; + +/* If nonzero, + for field mode, do not output lines containing no delimeter characters. */ +int delimited_lines_only; + +/* The delimeter character for field mode. */ +unsigned char delim; + +/* Nonzero if we have ever read standard input. */ +int have_read_stdin; + +/* The name this program was run with. 
*/ +char *program_name; + +struct option longopts[] = +{ + {"bytes", 1, 0, 'b'}, + {"characters", 1, 0, 'c'}, + {"fields", 1, 0, 'f'}, + {"delimiter", 1, 0, 'd'}, + {"only-delimited", 0, 0, 's'}, + {0, 0, 0, 0} +}; + +void +main (argc, argv) + int argc; + char **argv; +{ + int optc, exit_status = 0; + + program_name = argv[0]; + + line_size = 512; + operating_mode = undefined_mode; + delimited_lines_only = 0; + delim = '\0'; + have_read_stdin = 0; + + fields = (enum field_action *) + xmalloc (line_size * sizeof (enum field_action)); + outbuf = (char *) xmalloc (line_size); + inbuf = (char *) xmalloc (line_size); + + for (optc = 0; optc < line_size; optc++) + fields[optc] = field_omit; + + while ((optc = getopt_long (argc, argv, "b:c:d:f:ns", longopts, (int *) 0)) + != EOF) + { + switch (optc) + { + case 'b': + case 'c': + /* Build the byte list. */ + if (operating_mode != undefined_mode) + usage (); + operating_mode = byte_mode; + if (set_fields (optarg) == 0) + error (2, 0, "no fields given"); + break; + + case 'f': + /* Build the field list. */ + if (operating_mode != undefined_mode) + usage (); + operating_mode = field_mode; + if (set_fields (optarg) == 0) + error (2, 0, "no fields given"); + break; + + case 'd': + /* New delimiter. 
*/ + if (optarg[0] == '\0') + error (2, 0, "no delimiter given"); + if (optarg[1] != '\0') + error (2, 0, "delimiter must be a single character"); + delim = optarg[0]; + break; + + case 'n': + break; + + case 's': + delimited_lines_only++; + break; + + default: + usage (); + } + } + + if (operating_mode == undefined_mode) + usage (); + + if ((delimited_lines_only || delim != '\0') && operating_mode != field_mode) + usage (); + + if (delim == '\0') + delim = '\t'; + + if (optind == argc) + exit_status |= cut_file ("-"); + else + for (; optind < argc; optind++) + exit_status |= cut_file (argv[optind]); + + if (have_read_stdin && fclose (stdin) == EOF) + { + error (0, errno, "-"); + exit_status = 1; + } + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, 0, "write error"); + + exit (exit_status); +} + +/* Select for printing the positions in `fields' that are listed in + byte or field specification FIELDSTR. FIELDSTR should be + composed of one or more numbers or ranges of numbers, separated by + blanks or commas. Incomplete ranges may be given: `-m' means + `1-m'; `n-' means `n' through end of line or last field. + + Return the number of fields selected. */ + +int +set_fields (fieldstr) + char *fieldstr; +{ + int initial = 1; /* Value of first number in a range. */ + int dash_found = 0; /* Nonzero if a '-' is found in this field. */ + int value = 0; /* If nonzero, a number being accumulated. */ + int fields_selected = 0; /* Number of fields selected so far. */ + /* If nonzero, index of first field in a range that goes to end of line. */ + int eol_range_start = 0; + + for (;;) + { + if (*fieldstr == '-') + { + /* Starting a range. */ + if (dash_found) + invalid_list (); + dash_found++; + fieldstr++; + + if (value) + { + if (value >= line_size) + enlarge_line (value); + initial = value; + value = 0; + } + else + initial = 1; + } + else if (*fieldstr == ',' || isblank (*fieldstr) || *fieldstr == '\0') + { + /* Ending the string, or this field/byte sublist. 
*/ + if (dash_found) + { + dash_found = 0; + + /* A range. Possibilites: -n, m-n, n-. + In any case, `initial' contains the start of the range. */ + if (value == 0) + { + /* `n-'. From `initial' to end of line. */ + eol_range_start = initial; + fields_selected++; + } + else + { + /* `m-n' or `-n' (1-n). */ + if (value < initial) + invalid_list (); + + if (value >= line_size) + enlarge_line (value); + + /* Is there already a range going to end of line? */ + if (eol_range_start != 0) + { + /* Yes. Is the new sequence already contained + in the old one? If so, no processing is + necessary. */ + if (initial < eol_range_start) + { + /* No, the new sequence starts before the + old. Does the old range going to end of line + extend into the new range? */ + if (eol_range_start < value) + /* Yes. Simply move the end of line marker. */ + eol_range_start = initial; + else + { + /* No. A simple range, before and disjoint from + the range going to end of line. Fill it. */ + for (; initial <= value; initial++) + fields[initial] = field_output; + } + + /* In any case, some fields were selected. */ + fields_selected++; + } + } + else + { + /* There is no range going to end of line. */ + for (; initial <= value; initial++) + fields[initial] = field_output; + fields_selected++; + } + value = 0; + } + } + else if (value != 0) + { + /* A simple field number, not a range. */ + if (value >= line_size) + enlarge_line (value); + + fields[value] = field_output; + value = 0; + fields_selected++; + } + + if (*fieldstr == '\0') + { + /* If there was a range going to end of line, fill the + array from the end of line point. */ + if (eol_range_start) + for (initial = eol_range_start; initial < line_size; initial++) + fields[initial] = field_output; + + return fields_selected; + } + + fieldstr++; + } + else if (ISDIGIT (*fieldstr)) + { + value = 10 * value + *fieldstr - '0'; + fieldstr++; + } + else + invalid_list (); + } +} + +/* Process file FILE to standard output. 
+ Return 0 if successful, 1 if not. */ + +int +cut_file (file) + char *file; +{ + FILE *stream; + + if (!strcmp (file, "-")) + { + have_read_stdin = 1; + stream = stdin; + } + else + { + stream = fopen (file, "r"); + if (stream == NULL) + { + error (0, errno, "%s", file); + return 1; + } + } + + cut_stream (stream); + + if (ferror (stream)) + { + error (0, errno, "%s", file); + return 1; + } + if (!strcmp (file, "-")) + clearerr (stream); /* Also clear EOF. */ + else if (fclose (stream) == EOF) + { + error (0, errno, "%s", file); + return 1; + } + return 0; +} + +void +cut_stream (stream) + FILE *stream; +{ + if (operating_mode == byte_mode) + cut_bytes (stream); + else + cut_fields (stream); +} + +/* Print the file open for reading on stream STREAM + with the bytes marked `field_omit' in `fields' removed from each line. */ + +void +cut_bytes (stream) + FILE *stream; +{ + register int c; /* Each character from the file. */ + int doneflag = 0; /* Nonzero if EOF reached. */ + int char_count; /* Number of chars in the line so far. */ + + while (doneflag == 0) + { + /* Start processing a line. */ + outbufptr = outbuf; + char_count = 0; + + do + { + c = getc (stream); + if (c == EOF) + { + doneflag++; + break; + } + + /* If this character is to be sent, stow it in the outbuffer. */ + + if (++char_count == line_size - 1) + enlarge_line (char_count); + + if (fields[char_count] == field_output || c == '\n') + *outbufptr++ = c; + } + while (c != '\n'); + + if (char_count) + fwrite (outbuf, sizeof (char), outbufptr - outbuf, stdout); + } +} + +/* Print the file open for reading on stream STREAM + with the fields marked `field_omit' in `fields' removed from each line. + All characters are initially stowed in the raw input buffer, until + at least one field has been found. */ + +void +cut_fields (stream) + FILE *stream; +{ + register int c; /* Each character from the file. */ + int doneflag = 0; /* Nonzero if EOF reached. 
*/ + int char_count; /* Number of chars in line before any delim. */ + int fieldfound; /* Nonzero if any fields to print found. */ + int curr_field; /* Current index in `fields'. */ + + while (doneflag == 0) + { + char_count = 0; + fieldfound = 0; + curr_field = 1; + outbufptr = outbuf; + inbufptr = inbuf; + + do + { + c = getc (stream); + if (c == EOF) + { + doneflag++; + break; + } + + if (fields[curr_field] == field_output && c != '\n') + { + /* Working on a field. It, and its terminating + delimiter, go only into the processed buffer. */ + fieldfound = 1; + if (outbufptr - outbuf == line_size - 2) + enlarge_line (outbufptr - outbuf); + *outbufptr++ = c; + } + else if (fieldfound == 0) + { + if (++char_count == line_size - 1) + enlarge_line (char_count); + *inbufptr++ = c; + } + + if (c == delim && ++curr_field == line_size - 1) + enlarge_line (curr_field); + } + while (c != '\n'); + + if (fieldfound) + { + /* Something was found. Print it. */ + if (outbufptr[-1] == delim) + --outbufptr; /* Suppress trailing delimiter. */ + + fwrite (outbuf, sizeof (char), outbufptr - outbuf, stdout); + if (c == '\n') + putc (c, stdout); + } + else if (!delimited_lines_only && char_count) + /* A line with some characters, no delimiters, and no + suppression. Print it. */ + fwrite (inbuf, sizeof (char), inbufptr - inbuf, stdout); + } +} + +/* Extend the buffers to accomodate at least NEW_SIZE characters. */ + +void +enlarge_line (new_size) + int new_size; +{ + char *newp; + int i; + + new_size += 256; /* Leave some room to grow. 
*/ + + fields = (enum field_action *) + xrealloc (fields, new_size * sizeof (enum field_action)); + + newp = (char *) xrealloc (outbuf, new_size); + outbufptr += newp - outbuf; + outbuf = newp; + + newp = (char *) xrealloc (inbuf, new_size); + inbufptr += newp - inbuf; + inbuf = newp; + + for (i = line_size; i < new_size; i++) + fields[i] = field_omit; + line_size = new_size; +} + +void +invalid_list () +{ + error (2, 0, "invalid byte or field list"); +} + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s {-b byte-list,--bytes=byte-list} [-n] [file...]\n\ + %s {-c character-list,--characters=character-list} [file...]\n\ + %s {-f field-list,--fields=field-list} [-d delim] [-s]\n\ + [--delimiter=delim] [--only-delimited] [file...]\n", + program_name, program_name, program_name); + exit (2); +} diff --git a/src/expand.c b/src/expand.c new file mode 100644 index 0000000..8e47137 --- /dev/null +++ b/src/expand.c @@ -0,0 +1,377 @@ +/* expand - convert tabs to spaces + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* By default, convert all tabs to spaces. + Preserves backspace characters in the output; they decrement the + column count for tab calculations. + The default action is equivalent to -8. 
+ + Options: + --tabs=tab1[,tab2[,...]] + -t tab1[,tab2[,...]] + -tab1[,tab2[,...]] If only one tab stop is given, set the tabs tab1 + spaces apart instead of the default 8. Otherwise, + set the tabs at columns tab1, tab2, etc. (numbered from + 0); replace any tabs beyond the tabstops given with + single spaces. + --initial + -i Only convert initial tabs on each line to spaces. + + David MacKenzie */ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISDIGIT(c) (isascii ((c)) && isdigit ((c))) +#else +#define ISDIGIT(c) (isdigit ((c))) +#endif + +/* The number of bytes added at a time to the amount of memory + allocated for the output line. */ +#define OUTPUT_BLOCK 256 + +/* The number of bytes added at a time to the amount of memory + allocated for the list of tabstops. */ +#define TABLIST_BLOCK 256 + +char *xmalloc (); +char *xrealloc (); +void error (); + +FILE *next_file (); +void add_tabstop (); +void expand (); +void parse_tabstops (); +void usage (); +void validate_tabstops (); + +/* If nonzero, convert blanks even after nonblank characters have been + read on the line. */ +int convert_entire_line; + +/* If nonzero, the size of all tab stops. If zero, use `tab_list' instead. */ +int tab_size; + +/* Array of the explicit column numbers of the tab stops; + after `tab_list' is exhausted, each additional tab is replaced + by a space. The first column is column 0. */ +int *tab_list; + +/* The index of the first invalid element of `tab_list', + where the next element can be added. */ +int first_free_tab; + +/* Null-terminated array of input filenames. */ +char **file_list; + +/* Default for `file_list' if no files are given on the command line. */ +char *stdin_argv[] = +{ + "-", NULL +}; + +/* Nonzero if we have ever read standard input. */ +int have_read_stdin; + +/* Status to return to the system. 
/* Add the comma or blank separated list of tabstops STOPS
   to the list of tabstops.  Each run of digits is one tab stop; an empty
   item between separators adds nothing.  Any other character is reported
   through error () with status 1.  */

void
parse_tabstops (stops)
     char *stops;
{
  int pending = -1;             /* Tab stop being read, or -1 if none. */
  char *scan = stops;
  char ch;

  while ((ch = *scan++) != '\0')
    {
      if (ISDIGIT (ch))
        {
          if (pending == -1)
            pending = 0;
          pending = pending * 10 + ch - '0';
        }
      else if (ch == ',' || isblank (ch))
        {
          /* A separator ends the current number, if there is one. */
          add_tabstop (pending);
          pending = -1;
        }
      else
        error (1, 0, "tab size contains an invalid character");
    }

  /* The last number has no separator after it. */
  add_tabstop (pending);
}
*/ + +void +add_tabstop (tabval) + int tabval; +{ + if (tabval == -1) + return; + if (first_free_tab % TABLIST_BLOCK == 0) + tab_list = (int *) xrealloc (tab_list, first_free_tab + TABLIST_BLOCK); + tab_list[first_free_tab++] = tabval; +} + +/* Check that the list of tabstops TABS, with ENTRIES entries, + contains only nonzero, ascending values. */ + +void +validate_tabstops (tabs, entries) + int *tabs; + int entries; +{ + int prev_tab = 0; + int i; + + for (i = 0; i < entries; i++) + { + if (tabs[i] == 0) + error (1, 0, "tab size cannot be 0"); + if (tabs[i] <= prev_tab) + error (1, 0, "tab sizes must be ascending"); + prev_tab = tabs[i]; + } +} + +/* Change tabs to spaces, writing to stdout. + Read each file in `file_list', in order. */ + +void +expand () +{ + FILE *fp; /* Input stream. */ + int c; /* Each input character. */ + int tab_index = 0; /* Index in `tab_list' of next tabstop. */ + int column = 0; /* Column on screen of the next char. */ + int next_tab_column; /* Column the next tab stop is on. */ + int convert = 1; /* If nonzero, perform translations. */ + + fp = next_file ((FILE *) NULL); + for (;;) + { + c = getc (fp); + if (c == EOF) + { + fp = next_file (fp); + if (fp == NULL) + break; /* No more files. */ + else + continue; + } + + if (c == '\n') + { + putchar (c); + tab_index = 0; + column = 0; + convert = 1; + } + else if (c == '\t' && convert) + { + if (tab_size == 0) + { + /* Do not let tab_index == first_free_tab; + stop when it is 1 less. */ + while (tab_index < first_free_tab - 1 + && column >= tab_list[tab_index]) + tab_index++; + next_tab_column = tab_list[tab_index]; + if (tab_index < first_free_tab - 1) + tab_index++; + if (column >= next_tab_column) + next_tab_column = column + 1; /* Ran out of tab stops. 
*/ + } + else + { + next_tab_column = column + tab_size - column % tab_size; + } + while (column < next_tab_column) + { + putchar (' '); + ++column; + } + } + else + { + if (convert) + { + if (c == '\b') + { + if (column > 0) + --column; + } + else + { + ++column; + if (convert_entire_line == 0) + convert = 0; + } + } + putchar (c); + } + } +} + +/* Close the old stream pointer FP if it is non-NULL, + and return a new one opened to read the next input file. + Open a filename of `-' as the standard input. + Return NULL if there are no more input files. */ + +FILE * +next_file (fp) + FILE *fp; +{ + static char *prev_file; + char *file; + + if (fp) + { + if (ferror (fp)) + { + error (0, errno, "%s", prev_file); + exit_status = 1; + } + if (fp == stdin) + clearerr (fp); /* Also clear EOF. */ + else if (fclose (fp) == EOF) + { + error (0, errno, "%s", prev_file); + exit_status = 1; + } + } + + while ((file = *file_list++) != NULL) + { + if (file[0] == '-' && file[1] == '\0') + { + have_read_stdin = 1; + prev_file = file; + return stdin; + } + fp = fopen (file, "r"); + if (fp) + { + prev_file = file; + return fp; + } + error (0, errno, "%s", file); + exit_status = 1; + } + return NULL; +} + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-tab1[,tab2[,...]]] [-t tab1[,tab2[,...]]] [-i]\n\ + [--tabs=tab1[,tab2[,...]]] [--initial] [file...]\n", + program_name); + exit (1); +} diff --git a/src/fold.c b/src/fold.c new file mode 100644 index 0000000..d5d4ae3 --- /dev/null +++ b/src/fold.c @@ -0,0 +1,250 @@ +/* fold -- wrap each input line to fit in specified width. + Copyright (C) 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+/* Written by David MacKenzie. */
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#ifndef isblank
+#define isblank(c) ((c) == ' ' || (c) == '\t')
+#endif
+#include <stdio.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include "system.h"
+
+char *xrealloc ();
+int adjust_column ();
+int fold_file ();
+void error ();
+
+/* If nonzero, try to break on whitespace. */
+int break_spaces;
+
+/* If nonzero, count bytes, not column positions. */
+int count_bytes;
+
+/* If nonzero, at least one of the files we read was standard input. */
+int have_read_stdin;
+
+/* The name this program was run with. */
+char *program_name;
+
+struct option longopts[] =
+{
+  {"bytes", 0, NULL, 'b'},
+  {"spaces", 0, NULL, 's'},
+  {"width", 1, NULL, 'w'},
+  {NULL, 0, NULL, 0}
+};
+
+/* Parse the options, fold every named file (or standard input when
+   none is named) to standard output, and exit with the OR of the
+   per-file results. */
+
+void
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  int width = 80;
+  int i;
+  int optc;
+  int errs = 0;
+
+  program_name = argv[0];
+  break_spaces = count_bytes = have_read_stdin = 0;
+
+  while ((optc = getopt_long (argc, argv, "bsw:", longopts, (int *) 0))
+         != EOF)
+    {
+      switch (optc)
+        {
+        case 'b':               /* Count bytes rather than columns. */
+          count_bytes = 1;
+          break;
+
+        case 's':               /* Break at word boundaries. */
+          break_spaces = 1;
+          break;
+
+        case 'w':               /* Line width. */
+          width = atoi (optarg);
+          if (width < 1)
+            error (1, 0, "%s: invalid line width", optarg);
+          break;
+
+        default:
+          fprintf (stderr, "\
+Usage: %s [-bs] [-w width] [--bytes] [--spaces] [--width=width] [file...]\n",
+                   argv[0]);
+          exit (1);
+        }
+    }
+
+  if (argc == optind)
+    errs |= fold_file ("-", width);
+  else
+    for (i = optind; i < argc; i++)
+      errs |= fold_file (argv[i], width);
+
+  if (have_read_stdin && fclose (stdin) == EOF)
+    error (1, errno, "-");
+  if (fclose (stdout) == EOF)
+    error (1, errno, "write error");
+
+  exit (errs);
+}
+
+/* Fold file FILENAME, or standard input if FILENAME is "-",
+   to stdout, with maximum line length WIDTH.
+   Return 0 if successful, 1 if an error occurs.
+
+   The current output line is accumulated in a static buffer that is
+   grown in 1024-byte steps and reused across calls.  When a character
+   would push the column past WIDTH the pending text is written
+   followed by a newline; with `break_spaces' set, the break is made
+   after the last blank in the pending text if there is one. */
+
+int
+fold_file (filename, width)
+     char *filename;
+     int width;
+{
+  FILE *istream;
+  register int c;
+  int column = 0;               /* Screen column where next char will go. */
+  int offset_out = 0;           /* Index in `line_out' for next char. */
+  static char *line_out = NULL;
+  static size_t allocated_out = 0;
+
+  if (!strcmp (filename, "-"))
+    {
+      istream = stdin;
+      have_read_stdin = 1;
+    }
+  else
+    istream = fopen (filename, "r");
+
+  if (istream == NULL)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  while ((c = getc (istream)) != EOF)
+    {
+      /* Keep room for this character plus a newline added below. */
+      if (offset_out + 1 >= allocated_out)
+        {
+          allocated_out += 1024;
+          line_out = xrealloc (line_out, allocated_out);
+        }
+
+      if (c == '\n')
+        {
+          line_out[offset_out++] = c;
+          fwrite (line_out, sizeof (char), offset_out, stdout);
+          column = offset_out = 0;
+          continue;
+        }
+
+    rescan:
+      column = adjust_column (column, c);
+
+      if (column > width)
+        {
+          /* This character would make the line too long.
+             Print the line plus a newline, and make this character
+             start the next line. */
+          if (break_spaces)
+            {
+              /* Look for the last blank. */
+              int logical_end;
+
+              for (logical_end = offset_out - 1; logical_end >= 0;
+                   logical_end--)
+                if (isblank (line_out[logical_end]))
+                  break;
+              if (logical_end >= 0)
+                {
+                  int i;
+
+                  /* Found a blank.  Don't output the part after it. */
+                  logical_end++;
+                  fwrite (line_out, sizeof (char), logical_end, stdout);
+                  putchar ('\n');
+                  /* Move the remainder to the beginning of the next line.
+                     The areas being copied here might overlap. */
+                  bcopy (line_out + logical_end, line_out,
+                         offset_out - logical_end);
+                  offset_out -= logical_end;
+                  for (column = i = 0; i < offset_out; i++)
+                    column = adjust_column (column, line_out[i]);
+                  goto rescan;
+                }
+            }
+
+          /* Nothing is pending, so this single character is by itself
+             wider than WIDTH (for example a tab when WIDTH < 8).
+             Breaking the line cannot help: emitting a newline and
+             rescanning would hit the same condition forever.  Accept
+             the character; the next one will force the break. */
+          if (offset_out == 0)
+            {
+              line_out[offset_out++] = c;
+              continue;
+            }
+
+          line_out[offset_out++] = '\n';
+          fwrite (line_out, sizeof (char), offset_out, stdout);
+          column = offset_out = 0;
+          goto rescan;
+        }
+
+      line_out[offset_out++] = c;
+    }
+
+  /* Flush a final line that did not end in a newline. */
+  if (offset_out)
+    fwrite (line_out, sizeof (char), offset_out, stdout);
+
+  if (ferror (istream))
+    {
+      error (0, errno, "%s", filename);
+      if (strcmp (filename, "-"))
+        fclose (istream);
+      return 1;
+    }
+  if (strcmp (filename, "-") && fclose (istream) == EOF)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  if (ferror (stdout))
+    {
+      error (0, errno, "write error");
+      return 1;
+    }
+
+  return 0;
+}
+
+/* Assuming the current column is COLUMN, return the column that
+   printing C will move the cursor to.
+   The first column is 0.
+   When `count_bytes' is set every character advances the column by
+   one; otherwise backspace moves back one column (not below 0),
+   carriage return resets to 0, and tab advances to the next multiple
+   of 8. */
+
+int
+adjust_column (column, c)
+     int column;
+     char c;
+{
+  if (!count_bytes)
+    {
+      if (c == '\b')
+        {
+          if (column > 0)
+            column--;
+        }
+      else if (c == '\r')
+        column = 0;
+      else if (c == '\t')
+        column = column + 8 - column % 8;
+      else /* if (isprint (c)) */
+        column++;
+    }
+  else
+    column++;
+  return column;
+}
diff --git a/src/head.c b/src/head.c
new file mode 100644
index 0000000..0302b60
--- /dev/null
+++ b/src/head.c
@@ -0,0 +1,380 @@
+/* head -- output first part of file(s)
+   Copyright (C) 1989, 1990, 1991 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Options: + -b Print first N 512-byte blocks. + -c, --bytes=N[bkm] Print first N bytes + [or 512-byte blocks, kilobytes, or megabytes]. + -k Print first N kilobytes. + -N, -l, -n, --lines=N Print first N lines. + -m Print first N megabytes. + -q, --quiet, --silent Never print filename headers. + -v, --verbose Always print filename headers. + + Reads from standard input if no files are given or when a filename of + ``-'' is encountered. + By default, filename headers are printed only if more than one file + is given. + By default, prints the first 10 lines (head -n 10). + + David MacKenzie */ + +#include +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISDIGIT(c) (isascii ((c)) && isdigit ((c))) +#else +#define ISDIGIT(c) (isdigit ((c))) +#endif + +/* Number of lines/chars/blocks to head. */ +#define DEFAULT_NUMBER 10 + +/* Size of atomic reads. */ +#define BUFSIZE (512 * 8) + +/* Number of bytes per item we are printing. + If 0, head in lines. */ +int unit_size; + +/* If nonzero, print filename headers. */ +int print_headers; + +/* When to print the filename banners. */ +enum header_mode +{ + multiple_files, always, never +}; + +int head (); +int head_bytes (); +int head_file (); +int head_lines (); +long atou (); +void error (); +void parse_unit (); +void usage (); +void write_header (); +void xwrite (); + +/* The name this program was run with. */ +char *program_name; + +/* Have we ever read standard input? 
*/ +int have_read_stdin; + +struct option long_options[] = +{ + {"bytes", 1, NULL, 'c'}, + {"lines", 1, NULL, 'n'}, + {"quiet", 0, NULL, 'q'}, + {"silent", 0, NULL, 'q'}, + {"verbose", 0, NULL, 'v'}, + {NULL, 0, NULL, 0} +}; + +void +main (argc, argv) + int argc; + char **argv; +{ + enum header_mode header_mode = multiple_files; + int exit_status = 0; + long number = -1; /* Number of items to print (-1 if undef.). */ + int c; /* Option character. */ + + program_name = argv[0]; + have_read_stdin = 0; + unit_size = 0; + print_headers = 0; + + if (argc > 1 && argv[1][0] == '-' && ISDIGIT (argv[1][1])) + { + /* Old option syntax; a dash, one or more digits, and one or + more option letters. Move past the number. */ + for (number = 0, ++argv[1]; ISDIGIT (*argv[1]); ++argv[1]) + number = number * 10 + *argv[1] - '0'; + /* Parse any appended option letters. */ + while (*argv[1]) + { + switch (*argv[1]) + { + case 'b': + unit_size = 512; + break; + + case 'c': + unit_size = 1; + break; + + case 'k': + unit_size = 1024; + break; + + case 'l': + unit_size = 0; + break; + + case 'm': + unit_size = 1048576; + break; + + case 'q': + header_mode = never; + break; + + case 'v': + header_mode = always; + break; + + default: + error (0, 0, "unrecognized option `-%c'", *argv[1]); + usage (); + } + ++argv[1]; + } + /* Make the options we just parsed invisible to getopt. 
*/ + argv[1] = argv[0]; + argv++; + argc--; + } + + while ((c = getopt_long (argc, argv, "c:n:qv", long_options, (int *) 0)) + != EOF) + { + switch (c) + { + case 'c': + unit_size = 1; + parse_unit (optarg); + goto getnum; + case 'n': + unit_size = 0; + getnum: + number = atou (optarg); + if (number == -1) + error (1, 0, "invalid number `%s'", optarg); + break; + + case 'q': + header_mode = never; + break; + + case 'v': + header_mode = always; + break; + + default: + usage (); + } + } + + if (number == -1) + number = DEFAULT_NUMBER; + + if (unit_size > 1) + number *= unit_size; + + if (header_mode == always + || (header_mode == multiple_files && optind < argc - 1)) + print_headers = 1; + + if (optind == argc) + exit_status |= head_file ("-", number); + + for (; optind < argc; ++optind) + exit_status |= head_file (argv[optind], number); + + if (have_read_stdin && close (0) < 0) + error (1, errno, "-"); + if (close (1) < 0) + error (1, errno, "write error"); + + exit (exit_status); +} + +int +head_file (filename, number) + char *filename; + long number; +{ + int fd; + + if (!strcmp (filename, "-")) + { + have_read_stdin = 1; + filename = "standard input"; + if (print_headers) + write_header (filename); + return head (filename, 0, number); + } + else + { + fd = open (filename, O_RDONLY); + if (fd >= 0) + { + int errors; + + if (print_headers) + write_header (filename); + errors = head (filename, fd, number); + if (close (fd) == 0) + return errors; + } + error (0, errno, "%s", filename); + return 1; + } +} + +void +write_header (filename) + char *filename; +{ + static int first_file = 1; + + if (first_file) + { + xwrite (1, "==> ", 4); + first_file = 0; + } + else + xwrite (1, "\n==> ", 5); + xwrite (1, filename, strlen (filename)); + xwrite (1, " <==\n", 5); +} + +int +head (filename, fd, number) + char *filename; + int fd; + long number; +{ + if (unit_size) + return head_bytes (filename, fd, number); + else + return head_lines (filename, fd, number); +} + +int 
+head_bytes (filename, fd, bytes_to_write) + char *filename; + int fd; + long bytes_to_write; +{ + char buffer[BUFSIZE]; + int bytes_read; + + while (bytes_to_write) + { + bytes_read = read (fd, buffer, BUFSIZE); + if (bytes_read == -1) + { + error (0, errno, "%s", filename); + return 1; + } + if (bytes_read == 0) + break; + if (bytes_read > bytes_to_write) + bytes_read = bytes_to_write; + xwrite (1, buffer, bytes_read); + bytes_to_write -= bytes_read; + } + return 0; +} + +int +head_lines (filename, fd, lines_to_write) + char *filename; + int fd; + long lines_to_write; +{ + char buffer[BUFSIZE]; + int bytes_read; + int bytes_to_write; + + while (lines_to_write) + { + bytes_read = read (fd, buffer, BUFSIZE); + if (bytes_read == -1) + { + error (0, errno, "%s", filename); + return 1; + } + if (bytes_read == 0) + break; + bytes_to_write = 0; + while (bytes_to_write < bytes_read) + if (buffer[bytes_to_write++] == '\n' && --lines_to_write == 0) + break; + xwrite (1, buffer, bytes_to_write); + } + return 0; +} + +void +parse_unit (str) + char *str; +{ + int arglen = strlen (str); + + if (arglen == 0) + return; + + switch (str[arglen - 1]) + { + case 'b': + unit_size = 512; + str[arglen - 1] = '\0'; + break; + case 'k': + unit_size = 1024; + str[arglen - 1] = '\0'; + break; + case 'm': + unit_size = 1048576; + str[arglen - 1] = '\0'; + break; + } +} + +/* Convert STR, a string of ASCII digits, into an unsigned integer. + Return -1 if STR does not represent a valid unsigned integer. */ + +long +atou (str) + char *str; +{ + int value; + + for (value = 0; ISDIGIT (*str); ++str) + value = value * 10 + *str - '0'; + return *str ? 
-1 : value; +} + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-c N[bkm]] [-n N] [-qv] [--bytes=N[bkm]] [--lines=N]\n\ + [--quiet] [--silent] [--verbose] [file...]\n\ + %s [-Nbcklmqv] [file...]\n", program_name, program_name); + exit (1); +} diff --git a/src/join.c b/src/join.c new file mode 100644 index 0000000..9ac82e0 --- /dev/null +++ b/src/join.c @@ -0,0 +1,690 @@ +/* join - join lines of two files on a common field + Copyright (C) 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Written by Mike Haertel, mike@gnu.ai.mit.edu. */ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISSPACE(c) (isascii(c) && isspace(c)) +#define ISDIGIT(c) (isascii(c) && isdigit(c)) +#else +#define ISSPACE(c) isspace(c) +#define ISDIGIT(c) isdigit(c) +#endif + +char *xmalloc (); +char *xrealloc (); +void error (); +static void usage (); + +#define min(A, B) ((A) < (B) ? (A) : (B)) + +/* An element of the list describing the format of each + output line. */ +struct outlist +{ + int file; /* File to take field from (1 or 2). */ + int field; /* Field number to print. */ + struct outlist *next; +}; + +/* A field of a line. */ +struct field +{ + char *beg; /* First character in field. 
*/ + char *lim; /* Character after last character in field. */ +}; + +/* A line read from an input file. Newlines are not stored. */ +struct line +{ + char *beg; /* First character in line. */ + char *lim; /* Character after last character in line. */ + int nfields; /* Number of elements in `fields'. */ + struct field *fields; +}; + +/* One or more consecutive lines read from a file that all have the + same join field value. */ +struct seq +{ + int count; /* Elements used in `lines'. */ + int alloc; /* Elements allocated in `lines'. */ + struct line *lines; +}; + +/* If nonzero, print unpairable lines in file 1 or 2. */ +static int print_unpairables_1, print_unpairables_2; + +/* If nonzero, print pairable lines. */ +static int print_pairables; + +/* Empty output field filler. */ +static char *empty_filler; + +/* Field to join on. */ +static int join_field_1, join_field_2; + +/* List of fields to print. */ +struct outlist *outlist; + +/* Last element in `outlist', where a new element can be added. */ +struct outlist *outlist_end; + +/* Tab character separating fields; if this is NUL fields are separated + by any nonempty string of white space, otherwise by exactly one + tab character. */ +static char tab; + +/* The name this program was run with. */ +char *program_name; + +/* Fill in the `fields' structure in LINE. 
*/
+
+/* The field array is allocated with room for a file-wide running
+   maximum of fields (initially 2, doubled whenever a line needs more).
+   With a nonzero `tab', every occurrence of that character ends a
+   field.  Otherwise fields are separated by runs of white space, and
+   white space at the start of the line is skipped rather than being
+   taken as an empty first field. */
+
+static void
+xfields (line)
+     struct line *line;
+{
+  static int nfields = 2;
+  int i;
+  register char *ptr, *lim;
+
+  line->fields = (struct field *) xmalloc (nfields * sizeof (struct field));
+
+  ptr = line->beg;
+  lim = line->lim;
+
+  /* Leading separators are not significant in white-space mode;
+     without this, a line such as "  a b" would get an empty field 0
+     and every field number would be off by one. */
+  if (!tab)
+    while (ptr < lim && ISSPACE (*ptr))
+      ++ptr;
+
+  for (i = 0; ptr < lim; ++i)
+    {
+      if (i == nfields)
+        {
+          nfields *= 2;
+          line->fields = (struct field *)
+            xrealloc ((char *) line->fields, nfields * sizeof (struct field));
+        }
+      if (tab)
+        {
+          line->fields[i].beg = ptr;
+          while (ptr < lim && *ptr != tab)
+            ++ptr;
+          line->fields[i].lim = ptr;
+          /* Step over the single separator character. */
+          if (ptr < lim)
+            ++ptr;
+        }
+      else
+        {
+          line->fields[i].beg = ptr;
+          while (ptr < lim && !ISSPACE (*ptr))
+            ++ptr;
+          line->fields[i].lim = ptr;
+          /* Step over the whole run of separating white space. */
+          while (ptr < lim && ISSPACE (*ptr))
+            ++ptr;
+        }
+    }
+
+  line->nfields = i;
+}
+
+/* Read a line from FP into LINE and split it into fields.
+   Return 0 if EOF, 1 otherwise.
+   The text is stored in a freshly allocated buffer without its
+   newline; the initial buffer size is a running maximum that doubles
+   whenever a line outgrows it.  A final line that lacks a newline is
+   still returned. */
+
+static int
+get_line (fp, line)
+     FILE *fp;
+     struct line *line;
+{
+  static int linesize = 80;
+  int c, i;
+  char *ptr;
+
+  if (feof (fp))
+    return 0;
+
+  ptr = xmalloc (linesize);
+
+  for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
+    {
+      if (i == linesize)
+        {
+          linesize *= 2;
+          ptr = xrealloc (ptr, linesize);
+        }
+      ptr[i] = c;
+    }
+
+  /* EOF before any character was read: there is no line. */
+  if (c == EOF && i == 0)
+    {
+      free (ptr);
+      return 0;
+    }
+
+  line->beg = ptr;
+  line->lim = line->beg + i;
+  xfields (line);
+  return 1;
+}
+
+/* Free the field array and the text buffer of LINE. */
+
+static void
+freeline (line)
+     struct line *line;
+{
+  free ((char *) line->fields);
+  free (line->beg);
+}
+
+/* Make SEQ an empty sequence with room for one line. */
+
+static void
+initseq (seq)
+     struct seq *seq;
+{
+  seq->count = 0;
+  seq->alloc = 1;
+  seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
+}
+
+/* Read a line from FP and add it to SEQ.  Return 0 if EOF, 1 otherwise.
*/
+
+static int
+getseq (fp, seq)
+     FILE *fp;
+     struct seq *seq;
+{
+  struct line *slot;
+
+  /* Double the array once every allocated element is in use. */
+  if (seq->alloc == seq->count)
+    {
+      seq->alloc *= 2;
+      seq->lines = (struct line *)
+        xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
+    }
+
+  slot = seq->lines + seq->count;
+  if (!get_line (fp, slot))
+    return 0;
+
+  ++seq->count;
+  return 1;
+}
+
+/* Release the line array of SEQ.  Lines still stored in it are not
+   freed here. */
+
+static void
+delseq (seq)
+     struct seq *seq;
+{
+  free ((char *) seq->lines);
+}
+
+/* Compare the join fields of LINE1 and LINE2.
+   The result is negative, zero or positive as the field of LINE1
+   sorts before, equal to, or after the field of LINE2.  A line that
+   has no such field is treated as having an empty one, and an empty
+   field sorts before any nonempty field. */
+
+static int
+keycmp (line1, line2)
+     struct line *line1;
+     struct line *line2;
+{
+  char *key1 = NULL;            /* Start of the join field of LINE1. */
+  char *key2 = NULL;            /* Start of the join field of LINE2. */
+  int n1 = 0;                   /* Length of the join field of LINE1. */
+  int n2 = 0;                   /* Length of the join field of LINE2. */
+  int cmp;
+
+  if (join_field_1 < line1->nfields)
+    {
+      struct field *f = &line1->fields[join_field_1];
+
+      key1 = f->beg;
+      n1 = f->lim - f->beg;
+    }
+
+  if (join_field_2 < line2->nfields)
+    {
+      struct field *f = &line2->fields[join_field_2];
+
+      key2 = f->beg;
+      n2 = f->lim - f->beg;
+    }
+
+  if (n1 == 0)
+    {
+      if (n2 == 0)
+        return 0;
+      return -1;
+    }
+  if (n2 == 0)
+    return 1;
+
+  /* Compare the common prefix; on a tie the shorter key sorts first. */
+  cmp = memcmp (key1, key2, min (n1, n2));
+  return cmp ? cmp : n1 - n2;
+}
+
+/* Write field N of LINE to stdout.  If LINE has no field N, or the
+   field is empty, write `empty_filler' instead when one is set. */
+
+static void
+prfield (n, line)
+     int n;
+     struct line *line;
+{
+  int len = 0;
+
+  if (n < line->nfields)
+    len = line->fields[n].lim - line->fields[n].beg;
+
+  if (len)
+    fwrite (line->fields[n].beg, 1, len, stdout);
+  else if (empty_filler)
+    fputs (empty_filler, stdout);
+}
+
+/* Print LINE, with its fields separated by `tab'.
*/
+
+/* The separator is a blank when `tab' is NUL.  The terminating
+   newline is written even when LINE has no fields, so an empty input
+   line is reproduced as an empty output line instead of vanishing. */
+
+static void
+prline (line)
+     struct line *line;
+{
+  int i;
+
+  for (i = 0; i < line->nfields; ++i)
+    {
+      if (i > 0)
+        putchar (tab ? tab : ' ');
+      prfield (i, line);
+    }
+  putchar ('\n');
+}
+
+/* Print the join of LINE1 and LINE2.
+   With an explicit `outlist', print exactly the listed fields in list
+   order.  Otherwise print the join field (taken from LINE1), then the
+   remaining fields of LINE1, then the remaining fields of LINE2. */
+
+static void
+prjoin (line1, line2)
+     struct line *line1;
+     struct line *line2;
+{
+  if (outlist)
+    {
+      struct outlist *o;
+
+      /* Field numbers in `outlist' are 1-based. */
+      prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
+      for (o = outlist->next; o; o = o->next)
+        {
+          putchar (tab ? tab : ' ');
+          prfield (o->field - 1, o->file == 1 ? line1 : line2);
+        }
+      putchar ('\n');
+    }
+  else
+    {
+      int i;
+
+      prfield (join_field_1, line1);
+      for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
+        {
+          putchar (tab ? tab : ' ');
+          prfield (i, line1);
+        }
+      for (i = join_field_1 + 1; i < line1->nfields; ++i)
+        {
+          putchar (tab ? tab : ' ');
+          prfield (i, line1);
+        }
+
+      for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
+        {
+          putchar (tab ? tab : ' ');
+          prfield (i, line2);
+        }
+      for (i = join_field_2 + 1; i < line2->nfields; ++i)
+        {
+          putchar (tab ? tab : ' ');
+          prfield (i, line2);
+        }
+      putchar ('\n');
+    }
+}
+
+/* Print the join of the files in FP1 and FP2.
+   Both inputs are read in step.  A line whose key compares lower than
+   the other file's current key is unpairable and is printed only if
+   requested.  When the keys match, all consecutive lines with that key
+   are gathered from each file and every combination is printed. */
+
+static void
+join (fp1, fp2)
+     FILE *fp1;
+     FILE *fp2;
+{
+  struct seq seq1, seq2;
+  struct line line;
+  int diff, i, j, eof1, eof2;
+
+  /* Read the first line of each file. */
+  initseq (&seq1);
+  getseq (fp1, &seq1);
+  initseq (&seq2);
+  getseq (fp2, &seq2);
+
+  while (seq1.count && seq2.count)
+    {
+      diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
+      if (diff < 0)
+        {
+          if (print_unpairables_1)
+            prline (&seq1.lines[0]);
+          freeline (&seq1.lines[0]);
+          seq1.count = 0;
+          getseq (fp1, &seq1);
+          continue;
+        }
+      if (diff > 0)
+        {
+          if (print_unpairables_2)
+            prline (&seq2.lines[0]);
+          freeline (&seq2.lines[0]);
+          seq2.count = 0;
+          getseq (fp2, &seq2);
+          continue;
+        }
+
+      /* Keep reading lines from file1 as long as they continue to
+         match the current line from file2.
+         On EOF the count is bumped anyway, so that in both cases the
+         matching lines are elements 0 .. count - 2. */
+      eof1 = 0;
+      do
+        if (!getseq (fp1, &seq1))
+          {
+            eof1 = 1;
+            ++seq1.count;
+            break;
+          }
+      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
+
+      /* Keep reading lines from file2 as long as they continue to
+         match the current line from file1. */
+      eof2 = 0;
+      do
+        if (!getseq (fp2, &seq2))
+          {
+            eof2 = 1;
+            ++seq2.count;
+            break;
+          }
+      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
+
+      if (print_pairables)
+        {
+          for (i = 0; i < seq1.count - 1; ++i)
+            for (j = 0; j < seq2.count - 1; ++j)
+              prjoin (&seq1.lines[i], &seq2.lines[j]);
+        }
+
+      /* Drop the matched lines; the first non-matching line, if one
+         was read, becomes the new current line. */
+      for (i = 0; i < seq1.count - 1; ++i)
+        freeline (&seq1.lines[i]);
+      if (!eof1)
+        {
+          seq1.lines[0] = seq1.lines[seq1.count - 1];
+          seq1.count = 1;
+        }
+      else
+        seq1.count = 0;
+
+      for (i = 0; i < seq2.count - 1; ++i)
+        freeline (&seq2.lines[i]);
+      if (!eof2)
+        {
+          seq2.lines[0] = seq2.lines[seq2.count - 1];
+          seq2.count = 1;
+        }
+      else
+        seq2.count = 0;
+    }
+
+  /* One file is exhausted; whatever is left in the other one is
+     unpairable. */
+  if (print_unpairables_1 && seq1.count)
+    {
+      prline (&seq1.lines[0]);
+      freeline (&seq1.lines[0]);
+      while (get_line (fp1, &line))
+        {
+          prline (&line);
+          freeline (&line);
+        }
+    }
+
+  if (print_unpairables_2 && seq2.count)
+    {
+      prline (&seq2.lines[0]);
+      freeline (&seq2.lines[0]);
+      while (get_line (fp2, &line))
+        {
+          prline (&line);
+          freeline (&line);
+        }
+    }
+
+  delseq (&seq1);
+  delseq (&seq2);
+}
+
+/* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
+   unless either argument is invalid; then just return 0.
+   FILE must be 1 or 2 and FIELD must be at least 1. */
+
+static int
+add_field (file, field)
+     int file;
+     int field;
+{
+  struct outlist *o;
+
+  if (file < 1 || file > 2 || field < 1)
+    return 0;
+  o = (struct outlist *) xmalloc (sizeof (struct outlist));
+  o->file = file;
+  o->field = field;
+  o->next = NULL;
+
+  /* Add to the end of the list so the fields are in the right order. */
+  if (outlist == NULL)
+    outlist = o;
+  else
+    outlist_end->next = o;
+  outlist_end = o;
+
+  return 1;
+}
+
+/* Add the comma or blank separated field spec(s) in STR to `outlist'.
+ Return the number of fields added. */ + +static int +add_field_list (str) + char *str; +{ + int added = 0; + int file = -1, field = -1; + int dot_found = 0; + + for (; *str; str++) + { + if (*str == ',' || isblank (*str)) + { + added += add_field (file, field); + file = field = -1; + dot_found = 0; + } + else if (*str == '.') + dot_found = 1; + else if (ISDIGIT (*str)) + { + if (!dot_found) + { + if (file == -1) + file = 0; + file = file * 10 + *str - '0'; + } + else + { + if (field == -1) + field = 0; + field = field * 10 + *str - '0'; + } + } + else + return 0; + } + + added += add_field (file, field); + return added; +} + +/* When using getopt_long_only, no long option can start with + a character that is a short option. */ +static struct option longopts[] = +{ + {"j", 1, NULL, 'j'}, + {"j1", 1, NULL, '1'}, + {"j2", 1, NULL, '2'}, + {NULL, 0, NULL, 0} +}; + +void +main (argc, argv) + int argc; + char *argv[]; +{ + char *names[2]; + FILE *fp1, *fp2; + int optc, prev_optc = 0, nfiles, val; + + program_name = argv[0]; + nfiles = 0; + print_pairables = 1; + + while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts, + (int *) 0)) != EOF) + { + switch (optc) + { + case 'a': + val = atoi (optarg); + if (val == 1) + print_unpairables_1 = 1; + else if (val == 2) + print_unpairables_2 = 1; + else + error (2, 0, "invalid file number for `-a'"); + break; + + case 'e': + empty_filler = optarg; + break; + + case '1': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-1'"); + join_field_1 = val - 1; + break; + + case '2': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-2'"); + join_field_2 = val - 1; + break; + + case 'j': + val = atoi (optarg); + if (val <= 0) + error (2, 0, "invalid field number for `-j'"); + join_field_1 = join_field_2 = val - 1; + break; + + case 'o': + if (add_field_list (optarg) == 0) + error (2, 0, "invalid field list for `-o'"); + break; + + case 't': + tab = *optarg; + 
break; + + case 'v': + val = atoi (optarg); + if (val == 1) + print_unpairables_1 = 1; + else if (val == 2) + print_unpairables_2 = 1; + else + error (2, 0, "invalid file number for `-v'"); + print_pairables = 0; + break; + + case 1: /* Non-option argument. */ + if (prev_optc == 'o') + { + /* Might be continuation of args to -o. */ + if (add_field_list (optarg) > 0) + continue; /* Don't change `prev_optc'. */ + } + + if (nfiles > 1) + usage (); + names[nfiles++] = optarg; + break; + + case '?': + usage (); + } + prev_optc = optc; + } + + if (nfiles != 2) + usage (); + + fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin; + if (!fp1) + error (1, errno, "%s", names[0]); + fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin; + if (!fp2) + error (1, errno, "%s", names[1]); + if (fp1 == fp2) + error (1, errno, "both files cannot be standard input"); + join (fp1, fp2); + + if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF) + error (1, errno, "-"); + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, 0, "write error"); + + exit (0); +} + +static void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-a 1|2] [-v 1|2] [-e empty-string] [-o field-list...] [-t char]\n\ + [-j[1|2] field] [-1 field] [-2 field] file1 file2\n", + program_name); + exit (1); +} diff --git a/src/nl.c b/src/nl.c new file mode 100644 index 0000000..368001a --- /dev/null +++ b/src/nl.c @@ -0,0 +1,546 @@ +/* nl -- number lines of files + Copyright (C) 1989, 1992 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Scott Bartram (nancy!scott@uunet.uu.net) + Revised by David MacKenzie (djm@ai.mit.edu) */ + +#include +#include +#include +#include +#include "linebuffer.h" +#include "system.h" + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +/* Line-number formats. */ +enum number_format +{ + FORMAT_RIGHT_NOLZ, /* Right justified, no leading zeroes. */ + FORMAT_RIGHT_LZ, /* Right justified, leading zeroes. */ + FORMAT_LEFT /* Left justified, no leading zeroes. */ +}; + +/* Default section delimiter characters. */ +#define DEFAULT_SECTION_DELIMITERS "\\:" + +/* Types of input lines: either one of the section delimiters, + or text to output. */ +enum section +{ + Header, Body, Footer, Text +}; + +/* Format of body lines (-b). */ +char *body_type = "t"; + +/* Format of header lines (-h). */ +char *header_type = "n"; + +/* Format of footer lines (-f). */ +char *footer_type = "n"; + +/* Format currently being used (body, header, or footer). */ +char *current_type; + +/* Regex for body lines to number (-bp). */ +struct re_pattern_buffer body_regex; + +/* Regex for header lines to number (-hp). */ +struct re_pattern_buffer header_regex; + +/* Regex for footer lines to number (-fp). */ +struct re_pattern_buffer footer_regex; + +/* Pointer to current regex, if any. */ +struct re_pattern_buffer *current_regex = NULL; + +/* Separator string to print after line number (-s). */ +char *separator_str = "\t"; + +/* Input section delimiter string (-d). */ +char *section_del = DEFAULT_SECTION_DELIMITERS; + +/* Header delimiter string. */ +char *header_del = NULL; + +/* Header section delimiter length. */ +int header_del_len; + +/* Body delimiter string. */ +char *body_del = NULL; + +/* Body section delimiter length. */ +int body_del_len; + +/* Footer delimiter string. 
*/ +char *footer_del = NULL; + +/* Footer section delimiter length. */ +int footer_del_len; + +/* Input buffer. */ +struct linebuffer line_buf; + +/* printf format string for line number. */ +char *print_fmt; + +/* printf format string for unnumbered lines. */ +char *print_no_line_fmt = NULL; + +/* Starting line number on each page (-v). */ +int page_start = 1; + +/* Line number increment (-i). */ +int page_incr = 1; + +/* If TRUE, reset line number at start of each page (-p). */ +int reset_numbers = TRUE; + +/* Number of blank lines to consider to be one line for numbering (-l). */ +int blank_join = 1; + +/* Width of line numbers (-w). */ +int lineno_width = 6; + +/* Line number format (-n). */ +enum number_format lineno_format = FORMAT_RIGHT_NOLZ; + +/* Current print line number. */ +int line_no; + +/* The name this program was run with. */ +char *program_name; + +/* Nonzero if we have ever read standard input. */ +int have_read_stdin; + +enum section check_section (); +char *xmalloc (); +char *xrealloc (); +int build_type_arg (); +int nl_file (); +void usage (); +void process_file (); +void proc_header (); +void proc_body (); +void proc_footer (); +void proc_text (); +void print_lineno (); +void build_print_fmt (); +void error (); + +struct option longopts[] = +{ + {"header-numbering", 1, NULL, 'h'}, + {"body-numbering", 1, NULL, 'b'}, + {"footer-numbering", 1, NULL, 'f'}, + {"first-page", 1, NULL, 'v'}, + {"page-increment", 1, NULL, 'i'}, + {"no-renumber", 0, NULL, 'p'}, + {"join-blank-lines", 1, NULL, 'l'}, + {"number-separator", 1, NULL, 's'}, + {"number-width", 1, NULL, 'w'}, + {"number-format", 1, NULL, 'n'}, + {"section-delimiter", 1, NULL, 'd'}, + {NULL, 0, NULL, 0} +}; + +void +main (argc, argv) + int argc; + char **argv; +{ + int c, exit_status = 0; + + program_name = argv[0]; + have_read_stdin = 0; + + while ((c = getopt_long (argc, argv, "h:b:f:v:i:pl:s:w:n:d:", longopts, + (int *) 0)) != EOF) + { + switch (c) + { + case 'h': + if (build_type_arg 
(&header_type, &header_regex) != TRUE) + usage (); + break; + case 'b': + if (build_type_arg (&body_type, &body_regex) != TRUE) + usage (); + break; + case 'f': + if (build_type_arg (&footer_type, &footer_regex) != TRUE) + usage (); + break; + case 'v': + page_start = atoi (optarg); + break; + case 'i': + page_incr = atoi (optarg); + if (page_incr < 1) + page_incr = 1; + break; + case 'p': + reset_numbers = FALSE; + break; + case 'l': + blank_join = atoi (optarg); + break; + case 's': + separator_str = optarg; + break; + case 'w': + lineno_width = atoi (optarg); + if (lineno_width < 1) + lineno_width = 1; + break; + case 'n': + switch (*optarg) + { + case 'l': + if (optarg[1] == 'n') + lineno_format = FORMAT_LEFT; + else + usage (); + break; + case 'r': + switch (optarg[1]) + { + case 'n': + lineno_format = FORMAT_RIGHT_NOLZ; + break; + case 'z': + lineno_format = FORMAT_RIGHT_LZ; + break; + default: + usage (); + break; + } + break; + default: + usage (); + break; + } + break; + case 'd': + section_del = optarg; + break; + default: + usage (); + break; + } + } + + /* Initialize the section delimiters. */ + c = strlen (section_del); + + header_del_len = c * 3; + header_del = xmalloc (header_del_len + 1); + strcat (strcat (strcpy (header_del, section_del), section_del), section_del); + + body_del_len = c * 2; + body_del = xmalloc (body_del_len + 1); + strcat (strcpy (body_del, section_del), section_del); + + footer_del_len = c; + footer_del = xmalloc (footer_del_len + 1); + strcpy (footer_del, section_del); + + /* Initialize the input buffer. */ + initbuffer (&line_buf); + + /* Initialize the printf format for unnumbered lines. */ + c = strlen (separator_str); + print_no_line_fmt = xmalloc (lineno_width + c + 1); + memset (print_no_line_fmt, ' ', lineno_width + c); + print_no_line_fmt[lineno_width + c] = '\0'; + + line_no = page_start; + current_type = body_type; + current_regex = &body_regex; + build_print_fmt (); + + /* Main processing. 
*/ + + if (optind == argc) + exit_status |= nl_file ("-"); + else + for (; optind < argc; optind++) + exit_status |= nl_file (argv[optind]); + + if (have_read_stdin && fclose (stdin) == EOF) + { + error (0, errno, "-"); + exit_status = 1; + } + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, 0, "write error"); + + exit (exit_status); +} + +/* Process file FILE to standard output. + Return 0 if successful, 1 if not. */ + +int +nl_file (file) + char *file; +{ + FILE *stream; + + if (!strcmp (file, "-")) + { + have_read_stdin = 1; + stream = stdin; + } + else + { + stream = fopen (file, "r"); + if (stream == NULL) + { + error (0, errno, "%s", file); + return 1; + } + } + + process_file (stream); + + if (ferror (stream)) + { + error (0, errno, "%s", file); + return 1; + } + if (!strcmp (file, "-")) + clearerr (stream); /* Also clear EOF. */ + else if (fclose (stream) == EOF) + { + error (0, errno, "%s", file); + return 1; + } + return 0; +} + +/* Read and process the file pointed to by FP. */ + +void +process_file (fp) + FILE *fp; +{ + while (readline (&line_buf, fp)) + { + switch ((int) check_section ()) + { + case Header: + proc_header (); + break; + case Body: + proc_body (); + break; + case Footer: + proc_footer (); + break; + case Text: + proc_text (); + break; + } + } +} + +/* Return the type of line in `line_buf'. */ + +enum section +check_section () +{ + if (line_buf.length < 2 || memcmp (line_buf.buffer, section_del, 2)) + return Text; + if (line_buf.length == header_del_len + && !memcmp (line_buf.buffer, header_del, header_del_len)) + return Header; + if (line_buf.length == body_del_len + && !memcmp (line_buf.buffer, body_del, body_del_len)) + return Body; + if (line_buf.length == footer_del_len + && !memcmp (line_buf.buffer, footer_del, footer_del_len)) + return Footer; + return Text; +} + +/* Switch to a header section. 
*/ + +void +proc_header () +{ + current_type = header_type; + current_regex = &header_regex; + if (reset_numbers) + line_no = page_start; + putchar ('\n'); +} + +/* Switch to a body section. */ + +void +proc_body () +{ + current_type = body_type; + current_regex = &body_regex; + putchar ('\n'); +} + +/* Switch to a footer section. */ + +void +proc_footer () +{ + current_type = footer_type; + current_regex = &footer_regex; + putchar ('\n'); +} + +/* Process a regular text line in `line_buf'. */ + +void +proc_text () +{ + static int blank_lines = 0; /* Consecutive blank lines so far. */ + + switch (*current_type) + { + case 'a': + if (blank_join > 1) + { + if (line_buf.length || ++blank_lines == blank_join) + { + print_lineno (); + blank_lines = 0; + } + else + printf (print_no_line_fmt); + } + else + print_lineno (); + break; + case 't': + if (line_buf.length) + print_lineno (); + else + printf (print_no_line_fmt); + break; + case 'n': + printf (print_no_line_fmt); + break; + case 'p': + if (re_search (current_regex, line_buf.buffer, line_buf.length, + 0, line_buf.length, (struct re_registers *) 0) < 0) + printf (print_no_line_fmt); + else + print_lineno (); + break; + } + fwrite (line_buf.buffer, sizeof (char), line_buf.length, stdout); + putchar ('\n'); +} + +/* Print and increment the line number. */ + +void +print_lineno () +{ + printf (print_fmt, line_no); + line_no += page_incr; +} + +/* Build the printf format string, based on `lineno_format'. */ + +void +build_print_fmt () +{ + /* 12 = 10 chars for lineno_width, 1 for %, 1 for \0. 
*/ + print_fmt = xmalloc (strlen (separator_str) + 12); + switch (lineno_format) + { + case FORMAT_RIGHT_NOLZ: + sprintf (print_fmt, "%%%dd%s", lineno_width, separator_str); + break; + case FORMAT_RIGHT_LZ: + sprintf (print_fmt, "%%0%dd%s", lineno_width, separator_str); + break; + case FORMAT_LEFT: + sprintf (print_fmt, "%%-%dd%s", lineno_width, separator_str); + break; + } +} + +/* Set the command line flag TYPEP and possibly the regex pointer REGEXP, + according to `optarg'. */ + +int +build_type_arg (typep, regexp) + char **typep; + struct re_pattern_buffer *regexp; +{ + char *errmsg; + int rval = TRUE; + int optlen; + + switch (*optarg) + { + case 'a': + case 't': + case 'n': + *typep = optarg; + break; + case 'p': + *typep = optarg++; + optlen = strlen (optarg); + regexp->allocated = optlen * 2; + regexp->buffer = (unsigned char *) xmalloc (regexp->allocated); + regexp->translate = NULL; + regexp->fastmap = xmalloc (256); + regexp->fastmap_accurate = 0; + errmsg = re_compile_pattern (optarg, optlen, regexp); + if (errmsg) + error (1, 0, "%s", errmsg); + break; + default: + rval = FALSE; + break; + } + return rval; +} + +/* Print a usage message and quit. */ + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-h header-style] [-b body-style] [-f footer-style] [-p] [-d cc]\n\ + [-v start-number] [-i increment] [-l lines] [-s line-separator]\n\ + [-w line-no-width] [-n {ln,rn,rz}] [--header-numbering=style]\n\ + [--body-numbering=style] [--footer-numbering=style]\n\ + [--first-page=number] [--page-increment=number] [--no-renumber]\n\ + [--join-blank-lines=number] [--number-separator=string]\n\ + [--number-width=number] [--number-format={ln,rn,rz}]\n\ + [--section-delimiter=cc] [file...]\n", + program_name); + exit (2); +} diff --git a/src/od.c b/src/od.c new file mode 100644 index 0000000..f13c6b7 --- /dev/null +++ b/src/od.c @@ -0,0 +1,1697 @@ +/* od -- dump in octal (and other formats) the contents of files + Copyright (C) 1992 Free Software Foundation, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Jim Meyering. */ + +/* AIX requires this to be the first thing in the file. */ +#ifdef __GNUC__ +#define alloca __builtin_alloca +#else /* not __GNUC__ */ +#if HAVE_ALLOCA_H +#include +#else /* not HAVE_ALLOCA_H */ +#ifdef _AIX + #pragma alloca +#else /* not _AIX */ +char *alloca (); +#endif /* not _AIX */ +#endif /* not HAVE_ALLOCA_H */ +#endif /* not __GNUC__ */ + +#include +#include +#include +#include +#include +#include "system.h" + +#if defined(__GNUC__) || defined(STDC_HEADERS) +#include +#endif + +#ifdef __GNUC__ +typedef long double LONG_DOUBLE; +#else +typedef double LONG_DOUBLE; +#endif + +#if HAVE_LIMITS_H +#include +#endif +#ifndef SCHAR_MAX +#define SCHAR_MAX 127 +#endif +#ifndef SHRT_MAX +#define SHRT_MAX 32767 +#endif +#ifndef ULONG_MAX +#define ULONG_MAX ((unsigned long) ~(unsigned long) 0) +#endif + +#define STREQ(a,b) (strcmp((a), (b)) == 0) + +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +/* The default number of input bytes per output line. */ +#define DEFAULT_BYTES_PER_BLOCK 16 + +/* The number of decimal digits of precision in a float. */ +#ifndef FLT_DIG +#define FLT_DIG 7 +#endif + +/* The number of decimal digits of precision in a double. 
*/ +#ifndef DBL_DIG +#define DBL_DIG 15 +#endif + +/* The number of decimal digits of precision in a long double. */ +#ifndef LDBL_DIG +#define LDBL_DIG DBL_DIG +#endif + +char *xmalloc (); +char *xrealloc (); +void error (); + +enum size_spec +{ + NO_SIZE, + CHAR, + SHORT, + INT, + LONG, + FP_SINGLE, + FP_DOUBLE, + FP_LONG_DOUBLE +}; + +enum output_format +{ + SIGNED_DECIMAL, + UNSIGNED_DECIMAL, + OCTAL, + HEXADECIMAL, + FLOATING_POINT, + NAMED_CHARACTER, + CHARACTER +}; + +enum strtoul_error +{ + UINT_OK, UINT_INVALID, UINT_INVALID_SUFFIX_CHAR, UINT_OVERFLOW +}; +typedef enum strtoul_error strtoul_error; + +/* Each output format specification (from POSIX `-t spec' or from + old-style options) is represented by one of these structures. */ +struct tspec +{ + enum output_format fmt; + enum size_spec size; + void (*print_function) (); + char *fmt_string; +}; + +/* Convert the number of 8-bit bytes of a binary representation to + the number of characters (digits + sign if the type is signed) + required to represent the same quantity in the specified base/type. + For example, a 32-bit (4-byte) quantity may require a field width + as wide as the following for these types: + 11 unsigned octal + 11 signed decimal + 10 unsigned decimal + 8 unsigned hexadecimal */ + +static const unsigned int bytes_to_oct_digits[] = +{0, 3, 6, 8, 11, 14, 16, 19, 22, 25, 27, 30, 32, 35, 38, 41, 43}; + +static const unsigned int bytes_to_signed_dec_digits[] = +{1, 4, 6, 8, 11, 13, 16, 18, 20, 23, 25, 28, 30, 33, 35, 37, 40}; + +static const unsigned int bytes_to_unsigned_dec_digits[] = +{0, 3, 5, 8, 10, 13, 15, 17, 20, 22, 25, 27, 29, 32, 34, 37, 39}; + +static const unsigned int bytes_to_hex_digits[] = +{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32}; + +/* Convert enum size_spec to the size of the named type. 
*/ +static const int width_bytes[] = +{ + -1, + sizeof (char), + sizeof (short int), + sizeof (int), + sizeof (long int), + sizeof (float), + sizeof (double), + sizeof (LONG_DOUBLE) +}; + +/* Names for some non-printing characters. */ +static const char *const charname[33] = +{ + "nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel", + "bs", "ht", "nl", "vt", "ff", "cr", "so", "si", + "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb", + "can", "em", "sub", "esc", "fs", "gs", "rs", "us", + "sp" +}; + +/* A printf control string for printing a file offset. */ +static const char *output_address_fmt_string; + +/* FIXME: make this the number of octal digits in an unsigned long. */ +#define MAX_ADDRESS_LENGTH 13 +static char address_fmt_buffer[MAX_ADDRESS_LENGTH + 1]; +static char address_pad[MAX_ADDRESS_LENGTH + 1]; + +static unsigned long int string_min; +static unsigned long int flag_dump_strings; + +/* The number of input bytes to skip before formatting and writing. */ +static unsigned long int n_bytes_to_skip = 0; + +/* When non-zero, MAX_BYTES_TO_FORMAT is the maximum number of bytes + to be read and formatted. Otherwise all input is formatted. */ +static int limit_bytes_to_format = 0; + +/* The maximum number of bytes that will be formatted. This + value is used only when LIMIT_BYTES_TO_FORMAT is non-zero. */ +static unsigned long int max_bytes_to_format; + +/* When non-zero and two or more consecutive blocks are equal, format + only the first block and output an asterisk alone on the following + line to indicate that identical blocks have been elided. */ +static int abbreviate_duplicate_blocks = 1; + +/* An array of specs describing how to format each input block. */ +static struct tspec *spec; + +/* The number of format specs. */ +static unsigned int n_specs; + +/* The allocated length of SPEC. */ +static unsigned int n_specs_allocated; + +/* The number of input bytes formatted per output line. 
It must be + a multiple of the least common multiple of the sizes associated with + the specified output types. It should be as large as possible, but + no larger than 16 -- unless specified with the -w option. */ +static unsigned int bytes_per_block; + +/* Human-readable representation of *file_list (for error messages). + It differs from *file_list only when *file_list is "-". */ +static char const *input_filename; + +/* A NULL-terminated list of the file-arguments from the command line. + If no file-arguments were specified, this variable is initialized + to { "-", NULL }. */ +static char const *const *file_list; + +/* The input stream associated with the current file. */ +static FILE *in_stream; + +#define LONGEST_INTEGRAL_TYPE long int + +#define MAX_INTEGRAL_TYPE_SIZE sizeof(LONGEST_INTEGRAL_TYPE) +static enum size_spec integral_type_size[MAX_INTEGRAL_TYPE_SIZE + 1]; + +#define MAX_FP_TYPE_SIZE sizeof(LONG_DOUBLE) +static enum size_spec fp_type_size[MAX_FP_TYPE_SIZE + 1]; + +static struct option long_options[] = +{ + /* POSIX options. */ + {"skip-bytes", 1, NULL, 'j'}, + {"address-radix", 1, NULL, 'A'}, + {"read-bytes", 1, NULL, 'N'}, + {"format", 1, NULL, 't'}, + {"output-duplicates", 0, NULL, 'v'}, + + /* non-POSIX options. */ + {"strings", 2, NULL, 's'}, + {"width", 2, NULL, 'w'}, + {NULL, 0, NULL, 0} +}; + +/* The name this program was run with. */ +char *program_name; + +static void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-abcdfhiloxv] [-s[bytes]] [-w[bytes]] [-A radix] [-j bytes]\n\ + [-N bytes] [-t type] [--skip-bytes=bytes] [--address-radix=radix]\n\ + [--read-bytes=bytes] [--format=type] [--output-duplicates]\n\ + [--strings[=bytes]] [--width[=bytes]] [file...]\n", + program_name); + exit (1); +} + +/* Compute the greatest common denominator of U and V + using Euclid's algorithm. 
*/ + +static unsigned int +gcd (u, v) + unsigned int u; + unsigned int v; +{ + unsigned int t; + while (v != 0) + { + t = u % v; + u = v; + v = t; + } + return u; +} + +/* Compute the least common multiple of U and V. */ + +static unsigned int +lcm (u, v) + unsigned int u; + unsigned int v; +{ + unsigned int t = gcd (u, v); + if (t == 0) + return 0; + return u * v / t; +} + +static strtoul_error +my_strtoul (s, base, val, allow_bkm_suffix) + const char *s; + int base; + long unsigned int *val; + int allow_bkm_suffix; +{ + char *p; + unsigned long int tmp; + + assert (0 <= base && base <= 36); + + tmp = strtoul (s, &p, base); + if (errno != 0) + return UINT_OVERFLOW; + if (p == s) + return UINT_INVALID; + if (!allow_bkm_suffix) + { + if (*p == '\0') + { + *val = tmp; + return UINT_OK; + } + else + return UINT_INVALID_SUFFIX_CHAR; + } + + switch (*p) + { + case '\0': + break; + +#define BKM_SCALE(x,scale_factor) \ + do \ + { \ + if (x > (double) ULONG_MAX / scale_factor) \ + return UINT_OVERFLOW; \ + x *= scale_factor; \ + } \ + while (0) + + case 'b': + BKM_SCALE (tmp, 512); + break; + + case 'k': + BKM_SCALE (tmp, 1024); + break; + + case 'm': + BKM_SCALE (tmp, 1024 * 1024); + break; + + default: + return UINT_INVALID_SUFFIX_CHAR; + break; + } + + *val = tmp; + return UINT_OK; +} + +static void +uint_fatal_error (str, argument_type_string, err) + const char *str; + const char *argument_type_string; + strtoul_error err; +{ + switch (err) + { + case UINT_OK: + abort (); + + case UINT_INVALID: + error (2, 0, "invalid %s `%s'", argument_type_string, str); + break; + + case UINT_INVALID_SUFFIX_CHAR: + error (2, 0, "invalid character following %s `%s'", + argument_type_string, str); + break; + + case UINT_OVERFLOW: + error (2, 0, "%s `%s' larger than maximum unsigned long", + argument_type_string, str); + break; + } +} + +static void +print_s_char (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + 
err = 0; + for (i = n_bytes; i > 0; i--) + { + int tmp = (unsigned) *(unsigned char *) block; + if (tmp > SCHAR_MAX) + tmp = (SCHAR_MAX - tmp); + assert (tmp <= SCHAR_MAX); + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (unsigned char); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_char (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes; i > 0; i--) + { + unsigned int tmp = *(unsigned char *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (unsigned char); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_s_short (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (unsigned short); i > 0; i--) + { + int tmp = (unsigned) *(unsigned short *) block; + if (tmp > SHRT_MAX) + tmp = (SHRT_MAX - tmp); + assert (tmp <= SHRT_MAX); + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (unsigned short); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_short (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (unsigned short); i > 0; i--) + { + unsigned int tmp = *(unsigned short *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (unsigned short); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_int (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (unsigned int); i > 0; i--) + { + unsigned int tmp = *(unsigned int *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? 
'\n' : ' ')) == EOF); + block += sizeof (unsigned int); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_long (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (unsigned long); i > 0; i--) + { + unsigned long tmp = *(unsigned long *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (unsigned long); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_float (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (float); i > 0; i--) + { + float tmp = *(float *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (float); + } + if (err) + error (2, errno, "standard output"); +} + +static void +print_double (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (double); i > 0; i--) + { + double tmp = *(double *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? '\n' : ' ')) == EOF); + block += sizeof (double); + } + if (err) + error (2, errno, "standard output"); +} + +#ifdef __GNUC__ +static void +print_long_double (n_bytes, block, fmt_string) + long unsigned int n_bytes; + const char *block; + const char *fmt_string; +{ + int i, err; + err = 0; + for (i = n_bytes / sizeof (LONG_DOUBLE); i > 0; i--) + { + LONG_DOUBLE tmp = *(LONG_DOUBLE *) block; + err |= (printf (fmt_string, tmp, (i == 1 ? 
'\n' : ' ')) == EOF); + block += sizeof (LONG_DOUBLE); + } + if (err) + error (2, errno, "standard output"); +} + +#endif + +static void +print_named_ascii (n_bytes, block, unused_fmt_string) + long unsigned int n_bytes; + const char *block; + const char *unused_fmt_string; +{ + int i; + for (i = n_bytes; i > 0; i--) + { + unsigned int c = *(unsigned char *) block; + unsigned int masked_c = (0x7f & c); + const char *s; + char buf[5]; + + if (masked_c == 127) + s = "del"; + else if (masked_c <= 040) + s = charname[masked_c]; + else + { + sprintf (buf, " %c", masked_c); + s = buf; + } + + if (printf ("%3s%c", s, (i == 1 ? '\n' : ' ')) == EOF) + error (2, errno, "standard output"); + block += sizeof (unsigned char); + } +} + +static void +print_ascii (n_bytes, block, unused_fmt_string) + long unsigned int n_bytes; + const char *block; + const char *unused_fmt_string; +{ + int i; + for (i = n_bytes; i > 0; i--) + { + unsigned int c = *(unsigned char *) block; + const char *s; + char buf[5]; + + switch (c) + { + case '\0': + s = " \\0"; + break; + + case '\007': + s = " \\a"; + break; + + case '\b': + s = " \\b"; + break; + + case '\f': + s = " \\f"; + break; + + case '\n': + s = " \\n"; + break; + + case '\r': + s = " \\r"; + break; + + case '\t': + s = " \\t"; + break; + + case '\v': + s = " \\v"; + break; + + default: + sprintf (buf, (isprint (c) ? " %c" : "%03o"), c); + s = (const char *) buf; + } + + if (printf ("%3s%c", s, (i == 1 ? '\n' : ' ')) == EOF) + error (2, errno, "standard output"); + block += sizeof (unsigned char); + } +} + +/* Convert a null-terminated (possibly zero-length) string S to an + unsigned long integer value. If S points to a non-digit set *P to S, + *VAL to 0, and return 0. Otherwise, accumulate the integer value of + the string of digits. If the string of digits represents a value + larger than ULONG_MAX, don't modify *VAL or *P and return non-zero. 
+ Otherwise, advance *P to the first non-digit after S, set *VAL to + the result of the conversion and return zero. */ + +static int +simple_strtoul (s, p, val) + const char *s; + const char **p; + long unsigned int *val; +{ + unsigned long int sum; + + sum = 0; + while (isdigit (*s)) + { + unsigned int c = *s++ - '0'; + if (sum > (ULONG_MAX - c) / 10) + return 1; + sum = sum * 10 + c; + } + *p = s; + *val = sum; + return 0; +} + +/* If S points to a single valid POSIX-style od format string, put a + description of that format in *TSPEC, make *NEXT point at the character + following the just-decoded format (if *NEXT is non-NULL), and return + zero. If S is not valid, don't modify *NEXT or *TSPEC and return + non-zero. For example, if S were "d4afL" *NEXT would be set to "afL" + and *TSPEC would be + { + fmt = SIGNED_DECIMAL; + size = INT or LONG; (whichever integral_type_size[4] resolves to) + print_function = print_int; (assuming size == INT) + fmt_string = "%011d%c"; + } +*/ + +static int +decode_one_format (s, next, tspec) + const char *s; + const char **next; + struct tspec *tspec; +{ + enum size_spec size_spec; + unsigned long int size; + enum output_format fmt; + const char *pre_fmt_string; + char *fmt_string; + void (*print_function) (); + const char *p; + unsigned int c; + + assert (tspec != NULL); + + switch (*s) + { + case 'd': + case 'o': + case 'u': + case 'x': + c = *s; + ++s; + switch (*s) + { + case 'C': + ++s; + size = sizeof (char); + break; + + case 'S': + ++s; + size = sizeof (short); + break; + + case 'I': + ++s; + size = sizeof (int); + break; + + case 'L': + ++s; + size = sizeof (long int); + break; + + default: + if (simple_strtoul (s, &p, &size) != 0) + return 1; + if (p == s) + size = sizeof (int); + else + { + if (size > MAX_INTEGRAL_TYPE_SIZE + || integral_type_size[size] == NO_SIZE) + return 1; + s = p; + } + break; + } + +#define FMT_BYTES_ALLOCATED 9 + fmt_string = xmalloc (FMT_BYTES_ALLOCATED); + + size_spec = 
integral_type_size[size]; + + switch (c) + { + case 'd': + fmt = SIGNED_DECIMAL; + sprintf (fmt_string, "%%0%u%sd%%c", + bytes_to_signed_dec_digits[size], + (size_spec == LONG ? "l" : "")); + break; + + case 'o': + fmt = OCTAL; + sprintf (fmt_string, "%%0%u%so%%c", + bytes_to_oct_digits[size], + (size_spec == LONG ? "l" : "")); + break; + + case 'u': + fmt = UNSIGNED_DECIMAL; + sprintf (fmt_string, "%%0%u%su%%c", + bytes_to_unsigned_dec_digits[size], + (size_spec == LONG ? "l" : "")); + break; + + case 'x': + fmt = HEXADECIMAL; + sprintf (fmt_string, "%%0%u%sx%%c", + bytes_to_hex_digits[size], + (size_spec == LONG ? "l" : "")); + break; + + default: + abort (); + } + + assert (strlen (fmt_string) < FMT_BYTES_ALLOCATED); + + switch (size_spec) + { + case CHAR: + print_function = (fmt == SIGNED_DECIMAL + ? print_s_char + : print_char); + break; + + case SHORT: + print_function = (fmt == SIGNED_DECIMAL + ? print_s_short + : print_short);; + break; + + case INT: + print_function = print_int; + break; + + case LONG: + print_function = print_long; + break; + + default: + abort (); + } + break; + + case 'f': + fmt = FLOATING_POINT; + ++s; + switch (*s) + { + case 'F': + ++s; + size = sizeof (float); + break; + + case 'D': + ++s; + size = sizeof (double); + break; + + case 'L': + ++s; + size = sizeof (LONG_DOUBLE); + break; + + default: + if (simple_strtoul (s, &p, &size) != 0) + return 1; + if (p == s) + size = sizeof (double); + else + { + if (size > MAX_FP_TYPE_SIZE + || fp_type_size[size] == NO_SIZE) + return 1; + s = p; + } + break; + } + size_spec = fp_type_size[size]; + + switch (size_spec) + { + case FP_SINGLE: + print_function = print_float; + pre_fmt_string = "%%%d.%d#e%%c"; + fmt_string = xmalloc (strlen (pre_fmt_string)); + sprintf (fmt_string, pre_fmt_string, + FLT_DIG + 8, FLT_DIG); + break; + + case FP_DOUBLE: + print_function = print_double; + pre_fmt_string = "%%%d.%d#e%%c"; + fmt_string = xmalloc (strlen (pre_fmt_string)); + sprintf (fmt_string, 
pre_fmt_string, + DBL_DIG + 8, DBL_DIG); + break; + +#ifdef __GNUC__ + case FP_LONG_DOUBLE: + print_function = print_long_double; + pre_fmt_string = "%%%d.%d#le%%c"; + fmt_string = xmalloc (strlen (pre_fmt_string)); + sprintf (fmt_string, pre_fmt_string, + LDBL_DIG + 8, LDBL_DIG); + break; +#endif + + default: + abort (); + } + break; + + case 'a': + ++s; + fmt = NAMED_CHARACTER; + size_spec = CHAR; + fmt_string = NULL; + print_function = print_named_ascii; + break; + + case 'c': + ++s; + fmt = CHARACTER; + size_spec = CHAR; + fmt_string = NULL; + print_function = print_ascii; + break; + + default: + return 1; + } + + tspec->size = size_spec; + tspec->fmt = fmt; + tspec->print_function = print_function; + tspec->fmt_string = fmt_string; + + if (next != NULL) + *next = s; + + return 0; +} + +/* Decode the POSIX-style od format string S. Append the decoded + representation to the global array SPEC, reallocating SPEC if + necessary. Return zero if S is valid, non-zero otherwise. */ + +static int +decode_format_string (s) + const char *s; +{ + assert (s != NULL); + + while (*s != '\0') + { + struct tspec tspec; + const char *next; + + if (decode_one_format (s, &next, &tspec)) + return 1; + + assert (s != next); + s = next; + + if (n_specs >= n_specs_allocated) + { + n_specs_allocated = 1 + (3 * n_specs_allocated) / 2; + spec = (struct tspec *) xrealloc (spec, (n_specs_allocated + * sizeof (struct tspec))); + } + + bcopy ((char *) &tspec, (char *) &spec[n_specs], sizeof (struct tspec)); + ++n_specs; + } + + return 0; +} + +/* Given a list of one or more input filenames FILE_LIST, set the global + file pointer IN_STREAM to position N_SKIP in the concatenation of + those files. If any file operation fails or if there are fewer than + N_SKIP bytes in the combined input, give an error message and exit. + When possible, use seek- rather than read operations to advance + IN_STREAM. A file name of "-" is interpreted as standard input. 
*/ + +static void +skip (n_skip) + long unsigned int n_skip; +{ + for ( /*empty */ ; *file_list != NULL; ++file_list) + { + struct stat file_stats; + int j; + + if (STREQ (*file_list, "-")) + { + input_filename = "standard input"; + in_stream = stdin; + } + else + { + input_filename = *file_list; + in_stream = fopen (input_filename, "r"); + if (in_stream == NULL) + error (2, errno, "%s", input_filename); + } + + if (n_skip == 0) + break; + + /* First try using fseek. For large offsets, all this work is + worthwhile. If the offset is below some threshold it may be + more efficient to move the pointer by reading. There are two + issues when trying to use fseek: + - the file must be seekable. + - before seeking to the specified position, make sure + that the new position is in the current file. + Try to do that by getting file's size using stat(). + But that will work only for regular files and dirs. */ + + if (fstat (fileno (in_stream), &file_stats)) + error (2, errno, "%s", input_filename); + + /* The st_size field is valid only for regular files and + directories. FIXME: is the preceding true? + If the number of bytes left to skip is at least as large as + the size of the current file, we can decrement + n_skip and go on to the next file. */ + if (S_ISREG (file_stats.st_mode) || S_ISDIR (file_stats.st_mode)) + { + if (n_skip >= file_stats.st_size) + { + n_skip -= file_stats.st_size; + if (in_stream != stdin) + { + if (fclose (in_stream)) + error (2, errno, "%s", input_filename); + } + continue; + } + else + { + if (fseek (in_stream, n_skip, SEEK_SET) == 0) + { + n_skip = 0; + break; + } + } + } + + /* fseek didn't work or wasn't attempted; do it the slow way. */ + + for (j = n_skip / BUFSIZ; j >= 0; j--) + { + char buf[BUFSIZ]; + size_t n_bytes_to_read = (j > 0 + ? 
BUFSIZ + : n_skip % BUFSIZ); + size_t n_bytes_read; + n_bytes_read = fread (buf, 1, n_bytes_to_read, in_stream); + n_skip -= n_bytes_read; + if (n_bytes_read != n_bytes_to_read) + { + if (ferror (in_stream)) + error (2, errno, "%s", input_filename); + else + break; + } + } + + if (n_skip == 0) + break; + } + + if (n_skip != 0) + error (2, 0, "cannot skip past end of combined input"); +} + +static const char * +format_address (address) + long unsigned int address; +{ + const char *address_string; + + if (output_address_fmt_string == NULL) + address_string = ""; + else + { + sprintf (address_fmt_buffer, output_address_fmt_string, address); + address_string = address_fmt_buffer; + } + return address_string; +} + +/* Write N_BYTES bytes from CURR_BLOCK to standard output once for each + of the N_SPEC format specs. CURRENT_OFFSET is the byte address of + CURR_BLOCK in the concatenation of input files, and it is printed + (optionally) only before the output line associated with the first + format spec. When duplicate blocks are being abbreviated, the output + for a sequence of identical input blocks is the output for the first + block followed by an asterisk alone on a line. It is valid to compare + the blocks PREV_BLOCK and CURR_BLOCK only when N_BYTES == BYTES_PER_BLOCK. + That condition may be false only for the last input block -- and then + only when it has not been padded to length BYTES_PER_BLOCK. 
*/ + +static void +write_block (current_offset, n_bytes, prev_block, curr_block) + long unsigned int current_offset; + long unsigned int n_bytes; + const char *prev_block; + const char *curr_block; +{ + static int first = 1; + static int prev_pair_equal = 0; + +#define EQUAL_BLOCKS(b1, b2) (bcmp ((b1), (b2), bytes_per_block) == 0) + + if (abbreviate_duplicate_blocks + && !first && n_bytes == bytes_per_block + && EQUAL_BLOCKS (prev_block, curr_block)) + { + if (prev_pair_equal) + { + /* The two preceding blocks were equal, and the current + block is the same as the last one, so print nothing. */ + } + else + { + printf ("*\n"); + prev_pair_equal = 1; + } + } + else + { + int i; + + prev_pair_equal = 0; + for (i = 0; i < n_specs; i++) + { + if (printf ("%s ", (i == 0 + ? format_address (current_offset) + : address_pad)) + == EOF) + error (2, errno, "standard output"); + (*spec[i].print_function) (n_bytes, curr_block, spec[i].fmt_string); + } + } + first = 0; +} + +/* Read and return a single byte from the concatenation of the input + files named in the global array FILE_LIST. On the first call to this + function, the global variable IN_STREAM is expected to be an open + stream associated with the input file *FILE_LIST. If IN_STREAM is + at end-of-file, close it and update the global variables IN_STREAM, + FILE_LIST, and INPUT_FILENAME so they correspond to the next file in + the list. Then try to read a byte from the newly opened file. + Repeat if necessary until *FILE_LIST is NULL. Upon any read-, open-, + or close error give a message and exit. When EOF is reached for the + last file in FILE_LIST, return EOF. Any subsequent calls return EOF. 
*/ + +static int +read_char () +{ + if (*file_list == NULL) + return EOF; + + while (1) + { + int c; + + c = fgetc (in_stream); + + if (c != EOF) + return c; + + if (errno != 0) + error (2, errno, "%s", input_filename); + + if (in_stream != stdin) + if (fclose (in_stream) == EOF) + error (2, errno, "%s", input_filename); + + ++file_list; + if (*file_list == NULL) + return EOF; + + if (STREQ (*file_list, "-")) + { + input_filename = "standard input"; + in_stream = stdin; + } + else + { + input_filename = *file_list; + in_stream = fopen (input_filename, "r"); + if (in_stream == NULL) + error (2, errno, "%s", input_filename); + } + } +} + +/* Read N bytes into BLOCK from the concatenation of the input files + named in the global array FILE_LIST. On the first call to this + function, the global variable IN_STREAM is expected to be an open + stream associated with the input file *FILE_LIST. On subsequent + calls, if *FILE_LIST is NULL, don't modify BLOCK and return zero. + If all N bytes cannot be read from IN_STREAM, close IN_STREAM and + update the global variables IN_STREAM, FILE_LIST, and INPUT_FILENAME. + Then try to read the remaining bytes from the newly opened file. + Repeat if necessary until *FILE_LIST is NULL. Upon any read-, open-, + or close error give a message and exit. Otherwise, return the number + of bytes read. */ + +static unsigned long int +read_block (n, block) + size_t n; + char *block; +{ + unsigned long int n_bytes_in_buffer; + + assert (n > 0 && n <= bytes_per_block); + if (n == 0) + return 0; + + n_bytes_in_buffer = 0; + + if (*file_list == NULL) + return 0; /* EOF. 
*/ + + while (1) + { + size_t n_needed; + size_t n_read; + + n_needed = n - n_bytes_in_buffer; + n_read = fread (block + n_bytes_in_buffer, 1, n_needed, in_stream); + + if (ferror (in_stream)) + error (2, errno, "%s", input_filename); + + if (n_read == n_needed) + return n; + + n_bytes_in_buffer += n_read; + + if (in_stream != stdin) + if (fclose (in_stream) == EOF) + error (2, errno, "%s", input_filename); + + ++file_list; + if (*file_list == NULL) + return n_bytes_in_buffer; + + if (STREQ (*file_list, "-")) + { + input_filename = "standard input"; + in_stream = stdin; + } + else + { + input_filename = *file_list; + in_stream = fopen (input_filename, "r"); + if (in_stream == NULL) + error (2, errno, "%s", input_filename); + } + } +} + +/* Return the least common multiple of the sizes associated + with the format specs. */ + +static int +get_lcm () +{ + int i; + int l_c_m = 1; + + for (i = 0; i < n_specs; i++) + l_c_m = lcm (l_c_m, width_bytes[(int) spec[i].size]); + return l_c_m; +} + +/* Read chunks of size BYTES_PER_BLOCK from the input files, write the + formatted block to standard output, and repeat until the specified + maximum number of bytes has been read or until all input has been + processed. If the last block read is smaller than BYTES_PER_BLOCK + and its size is not a multiple of the size associated with a format + spec, extend the input block with zero bytes until its length is a + multiple of all format spec sizes. Write the final block. Finally, + write on a line by itself the offset of the byte after the last byte + read. 
*/

static void
dump ()
{
  /* Two buffers are alternated so that write_block can compare the
     block being written with the previous one.  */
  char *block[2];
  unsigned long int current_offset;
  int idx = 0;
  size_t n_bytes_read;

  block[0] = (char *) alloca (bytes_per_block);
  block[1] = (char *) alloca (bytes_per_block);

  current_offset = n_bytes_to_skip;

  if (limit_bytes_to_format)
    {
      size_t end_offset = n_bytes_to_skip + max_bytes_to_format;

      while (1)
        {
          size_t n_needed;

          /* The byte limit was reached exactly on a block boundary.
             Nothing is left over, so make sure the partial-block code
             below does not print the previous full block a second
             time.  */
          if (current_offset >= end_offset)
            {
              n_bytes_read = 0;
              break;
            }

          n_needed = MIN (end_offset - current_offset, bytes_per_block);
          n_bytes_read = read_block (n_needed, block[idx]);
          /* A short block is either the tail allowed by the limit or
             the end of the input; it is written out below.  */
          if (n_bytes_read < bytes_per_block)
            break;
          assert (n_bytes_read == bytes_per_block);
          write_block (current_offset, n_bytes_read,
                       block[!idx], block[idx]);
          current_offset += n_bytes_read;
          idx = !idx;
        }
    }
  else
    {
      while (1)
        {
          n_bytes_read = read_block (bytes_per_block, block[idx]);
          if (n_bytes_read < bytes_per_block)
            break;
          assert (n_bytes_read == bytes_per_block);
          write_block (current_offset, n_bytes_read,
                       block[!idx], block[idx]);
          current_offset += n_bytes_read;
          idx = !idx;
        }
    }

  /* Here n_bytes_read is the size of a final, incomplete block
     (possibly zero).  */
  if (n_bytes_read > 0)
    {
      int l_c_m;
      size_t bytes_to_write;

      l_c_m = get_lcm ();

      /* Make bytes_to_write the smallest multiple of l_c_m that
         is at least as large as n_bytes_read.  */
      bytes_to_write = l_c_m * (int) ((n_bytes_read + l_c_m - 1) / l_c_m);

      /* Zero-fill the padding so the last datum is well defined.  */
      bzero (block[idx] + n_bytes_read, bytes_to_write - n_bytes_read);
      write_block (current_offset, bytes_to_write,
                   block[!idx], block[idx]);
      /* Only the bytes really read count toward the final offset.  */
      current_offset += n_bytes_read;
    }

  if (output_address_fmt_string != NULL)
    {
      /* printf reports failure with any negative value, not
         necessarily EOF.  */
      if (printf ("%s\n", format_address (current_offset)) < 0)
        error (2, errno, "standard output");
    }
}

/* STRINGS mode.  Find each "string constant" in the file.
   A string constant is a run of at least `string_min' ASCII graphic
   (or formatting) characters terminated by a null.  Based on a
   function written by Richard Stallman for a pre-POSIX
   version of od.  
*/ + +static void +dump_strings () +{ + int bufsize = MAX (100, string_min); + char *buf = xmalloc (bufsize); + unsigned long address = n_bytes_to_skip; + + while (1) + { + int i; + int c; + + /* See if the next `string_min' chars are all printing chars. */ + tryline: + + if (limit_bytes_to_format + && address >= (n_bytes_to_skip + max_bytes_to_format - string_min)) + break; + + for (i = 0; i < string_min; i++) + { + c = read_char (); + address++; + if (c < 0) + return; + if (!isprint (c)) + /* Found a non-printing. Try again starting with next char. */ + goto tryline; + buf[i] = c; + } + + /* We found a run of `string_min' printable characters. + Now see if it is terminated with a null byte. */ + while (!limit_bytes_to_format + || address < n_bytes_to_skip + max_bytes_to_format) + { + if (i == bufsize) + { + bufsize = 1 + 3 * bufsize / 2; + buf = xrealloc (buf, bufsize); + } + c = read_char (); + address++; + if (c < 0) + return; + if (c == '\0') + break; /* It is; print this string. */ + if (!isprint (c)) + goto tryline; /* It isn't; give up on this string. */ + buf[i++] = c; /* String continues; store it all. */ + } + + /* If we get here, the string is all printable and null-terminated, + so print it. It is all in `buf' and `i' is its length. 
*/ + buf[i] = 0; + if (output_address_fmt_string != NULL) + { + if (printf ("%s ", format_address (address - i - 1)) == EOF) + error (2, errno, "standard output"); + } + for (i = 0; (c = buf[i]); i++) + { + int err; + switch (c) + { + case '\007': + err = fputs ("\\a", stdout); + break; + + case '\b': + err = fputs ("\\b", stdout); + break; + + case '\f': + err = fputs ("\\f", stdout); + break; + + case '\n': + err = fputs ("\\n", stdout); + break; + + case '\r': + err = fputs ("\\r", stdout); + break; + + case '\t': + err = fputs ("\\t", stdout); + break; + + case '\v': + err = fputs ("\\v", stdout); + break; + + default: + err = putchar (c); + } + if (err == EOF) + error (2, errno, "standard output"); + } + if (putchar ('\n') == EOF) + error (2, errno, "standard output"); + } + free (buf); +} + +void +main (argc, argv) + int argc; + char **argv; +{ + int c; + int n_files; + int i; + unsigned int l_c_m; + unsigned int address_pad_len; + unsigned long int desired_width; + int width_specified = 0; + + program_name = argv[0]; + + for (i = 0; i <= MAX_INTEGRAL_TYPE_SIZE; i++) + integral_type_size[i] = NO_SIZE; + + integral_type_size[sizeof (char)] = CHAR; + integral_type_size[sizeof (short int)] = SHORT; + integral_type_size[sizeof (int)] = INT; + integral_type_size[sizeof (long int)] = LONG; + + for (i = 0; i <= MAX_FP_TYPE_SIZE; i++) + fp_type_size[i] = NO_SIZE; + + fp_type_size[sizeof (float)] = FP_SINGLE; + /* The array entry for `double' is filled in after that for LONG_DOUBLE + so that if `long double' is the same type or if long double isn't + supported FP_LONG_DOUBLE will never be used. 
*/ + fp_type_size[sizeof (LONG_DOUBLE)] = FP_LONG_DOUBLE; + fp_type_size[sizeof (double)] = FP_DOUBLE; + + n_specs = 0; + n_specs_allocated = 5; + spec = (struct tspec *) xmalloc (n_specs_allocated * sizeof (struct tspec)); + + output_address_fmt_string = "%07o"; + address_pad_len = 7; + flag_dump_strings = 0; + + while ((c = getopt_long (argc, argv, "abcdfhilos::xw::A:j:N:t:v", + long_options, (int *) 0)) + != EOF) + { + strtoul_error err; + + switch (c) + { + case 'A': + switch (optarg[0]) + { + case 'd': + output_address_fmt_string = "%07d"; + address_pad_len = 7; + break; + case 'o': + output_address_fmt_string = "%07o"; + address_pad_len = 7; + break; + case 'x': + output_address_fmt_string = "%06x"; + address_pad_len = 6; + break; + case 'n': + output_address_fmt_string = NULL; + address_pad_len = 0; + break; + default: + error (2, 0, + "invalid output address radix `%c'; it must be one character from [doxn]", + optarg[0]); + break; + } + break; + + case 'j': + err = my_strtoul (optarg, 0, &n_bytes_to_skip, 1); + if (err != UINT_OK) + uint_fatal_error (optarg, "skip argument", err); + break; + + case 'N': + limit_bytes_to_format = 1; + + err = my_strtoul (optarg, 0, &max_bytes_to_format, 1); + if (err != UINT_OK) + uint_fatal_error (optarg, "limit argument", err); + break; + + case 's': + if (optarg == NULL) + string_min = 3; + else + { + err = my_strtoul (optarg, 0, &string_min, 1); + if (err != UINT_OK) + uint_fatal_error (optarg, "minimum string length", err); + } + ++flag_dump_strings; + break; + + case 't': + if (decode_format_string (optarg)) + error (2, 0, "invalid type string `%s'", optarg); + break; + + case 'v': + abbreviate_duplicate_blocks = 0; + break; + + /* The next several cases map the old, pre-POSIX format + specification options to the corresponding POSIX format + specs. GNU od accepts any combination of old- and + new-style options. If only POSIX format specs are used + and more than one is used, they are accumulated. 
If only + old-style options are used, all but the last are ignored. + If both types of specs are used in the same command, the + last old-style option and any POSIX specs following it + are accumulated. To illustrate, `od -c -t a' is the same + as `od -t ca', but `od -t a -c' is the same as `od -c'. */ + +#define CASE_OLD_ARG(old_char,new_string) \ + case old_char: \ + { \ + const char *next; \ + int tmp; \ + assert (n_specs_allocated >= 1); \ + tmp = decode_one_format (new_string, &next, &(spec[0])); \ + n_specs = 1; \ + assert (tmp == 0); \ + assert (*next == '\0'); \ + } \ + break + + CASE_OLD_ARG ('a', "a"); + CASE_OLD_ARG ('b', "oC"); + CASE_OLD_ARG ('c', "c"); + CASE_OLD_ARG ('d', "u2"); + CASE_OLD_ARG ('f', "fF"); + CASE_OLD_ARG ('h', "x2"); + CASE_OLD_ARG ('i', "d2"); + CASE_OLD_ARG ('l', "d4"); + CASE_OLD_ARG ('o', "o2"); + CASE_OLD_ARG ('x', "x2"); + +#undef CASE_OLD_ARG + + case 'w': + width_specified = 1; + if (optarg == NULL) + { + desired_width = 32; + } + else + { + err = my_strtoul (optarg, 10, &desired_width, 0); + if (err != UINT_OK) + error (2, 0, "invalid width specification `%s'", optarg); + } + break; + + default: + usage (); + break; + } + } + + if (flag_dump_strings && n_specs > 0) + error (2, 0, "no type may be specified when dumping strings"); + + assert (address_pad_len <= MAX_ADDRESS_LENGTH); + for (i = 0; i < address_pad_len; i++) + address_pad[i] = ' '; + address_pad[address_pad_len] = '\0'; + + if (n_specs == 0) + { + int err = decode_one_format ("o2", NULL, &(spec[0])); + + assert (err == 0); + n_specs = 1; + } + + n_files = argc - optind; + if (n_files > 0) + file_list = (char const *const *) &argv[optind]; + else + { + /* If no files were listed on the command line, set up the + global array FILE_LIST so that it contains the null-terminated + list of one name: "-". */ + static char const * const default_file_list[] = {"-", NULL}; + + file_list = default_file_list; + } + + skip (n_bytes_to_skip); + + /* Compute output block length. 
*/ + l_c_m = get_lcm (); + + if (width_specified) + { + if (desired_width != 0 && desired_width % l_c_m == 0) + bytes_per_block = desired_width; + else + { + error (0, 0, "warning: invalid width %d; using %d instead", + desired_width, l_c_m); + bytes_per_block = l_c_m; + } + } + else + { + if (l_c_m < DEFAULT_BYTES_PER_BLOCK) + bytes_per_block = l_c_m * (int) (DEFAULT_BYTES_PER_BLOCK / l_c_m); + else + bytes_per_block = l_c_m; + } + +#ifdef DEBUG + for (i = 0; i < n_specs; i++) + { + printf ("%d: fmt=\"%s\" width=%d\n", + i, spec[i].fmt_string, width_bytes[spec[i].size]); + } +#endif + + if (flag_dump_strings) + { + dump_strings (); + } + else + { + dump (); + } + + exit (0); +} diff --git a/src/paste.c b/src/paste.c new file mode 100644 index 0000000..c7058a6 --- /dev/null +++ b/src/paste.c @@ -0,0 +1,458 @@ +/* paste - merge lines of files + Copyright (C) 1984 by David M. Ihnat + + This program is a total rewrite of the Bell Laboratories Unix(Tm) + command of the same name, as of System V. It contains no proprietary + code, and therefore may be used without violation of any proprietary + agreements whatsoever. However, you will notice that the program is + copyrighted by me. This is to assure the program does *not* fall + into the public domain. Thus, I may specify just what I am now: + This program may be freely copied and distributed, provided this notice + remains; it may not be sold for profit without express written consent of + the author. + Please note that I recreated the behavior of the Unix(Tm) 'paste' command + as faithfully as possible, with minor exceptions; however, + I haven't run a full set of regression tests. Thus, the user of + this program accepts full responsibility for any effects or loss; + in particular, the author is not responsible for any losses, + explicit or incidental, that may be incurred through use of this program. + + I ask that any bugs (and, if possible, fixes) be reported to me when + possible. 
-David Ihnat (312) 784-4544 ignatz@homebru.chi.il.us + + The list of valid escape sequences has been expanded over the Unix + version, to include \b, \f, \r, and \v. + + POSIX changes, bug fixes, long-named options, and cleanup + by David MacKenzie . + + Options: + --serial + -s Paste one file at a time rather than + one line from each file. + --delimiters=delim-list + -d delim-list Consecutively use the characters in + DELIM-LIST instead of tab to separate + merged lines. When DELIM-LIST is exhausted, + start again at its beginning. + A FILE of `-' means standard input. + If no FILEs are given, standard input is used. */ + +#include +#include +#include +#include "system.h" + +char *collapse_escapes (); +char *xmalloc (); +char *xrealloc (); +int paste_parallel (); +int paste_serial (); +void error (); +void usage (); + +/* Indicates that no delimiter should be added in the current position. */ +#define EMPTY_DELIM '\0' + +/* Element marking a file that has reached EOF and been closed. */ +#define CLOSED ((FILE *) -1) + +/* Element marking end of list of open files. */ +#define ENDLIST ((FILE *) -2) + +/* Name this program was run with. */ +char *program_name; + +/* If nonzero, we have read standard input at some point. */ +int have_read_stdin; + +/* If nonzero, merge subsequent lines of each file rather than + corresponding lines from each file in parallel. */ +int serial_merge; + +/* The delimeters between lines of input files (used cyclically). */ +char *delims; + +/* A pointer to the character after the end of `delims'. 
*/ +char *delim_end; + +struct option longopts[] = +{ + {"serial", 0, 0, 's'}, + {"delimiters", 1, 0, 'd'}, + {0, 0, 0, 0} +}; + +void +main (argc, argv) + int argc; + char **argv; +{ + int optc, exit_status; + char default_delims[2], zero_delims[3]; + + program_name = argv[0]; + have_read_stdin = 0; + serial_merge = 0; + delims = default_delims; + strcpy (delims, "\t"); + strcpy (zero_delims, "\\0"); + + while ((optc = getopt_long (argc, argv, "d:s", longopts, (int *) 0)) + != EOF) + { + switch (optc) + { + case 'd': + /* Delimiter character(s). */ + if (optarg[0] == '\0') + optarg = zero_delims; + delims = optarg; + break; + + case 's': + serial_merge++; + break; + + default: + usage (); + } + } + + if (optind == argc) + argv[argc++] = "-"; + + delim_end = collapse_escapes (delims); + + if (!serial_merge) + exit_status = paste_parallel (argc - optind, &argv[optind]); + else + exit_status = paste_serial (argc - optind, &argv[optind]); + if (have_read_stdin && fclose (stdin) == EOF) + error (1, errno, "-"); + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, errno, "write error"); + exit (exit_status); +} + +/* Replace backslash representations of special characters in + STRPTR with their actual values. + The set of possible backslash characters has been expanded beyond + that recognized by the Unix version. + + Return a pointer to the character after the new end of STRPTR. */ + +char * +collapse_escapes (strptr) + char *strptr; +{ + register char *strout; + + strout = strptr; /* Start at the same place, anyway. */ + + while (*strptr) + { + if (*strptr != '\\') /* Is it an escape character? */ + *strout++ = *strptr++; /* No, just transfer it. 
*/ + else + { + switch (*++strptr) + { + case '0': + *strout++ = EMPTY_DELIM; + break; + + case 'b': + *strout++ = '\b'; + break; + + case 'f': + *strout++ = '\f'; + break; + + case 'n': + *strout++ = '\n'; + break; + + case 'r': + *strout++ = '\r'; + break; + + case 't': + *strout++ = '\t'; + break; + + case 'v': + *strout++ = '\v'; + break; + + default: + *strout++ = *strptr; + break; + } + strptr++; + } + } + return strout; +} + +/* Perform column paste on the NFILES files named in FNAMPTR. + Return 0 if no errors, 1 if one or more files could not be + opened or read. */ + +int +paste_parallel (nfiles, fnamptr) + int nfiles; + char **fnamptr; +{ + int errors = 0; /* 1 if open or read errors occur. */ + /* Number of files for which space is allocated in `delbuf' and `fileptr'. + Enlarged as necessary. */ + int file_list_size = 12; + int chr; /* Input character. */ + int line_length; /* Number of chars in line. */ + int somedone; /* 0 if all files empty for this line. */ + /* If all files are just ready to be closed, or will be on this + round, the string of delimiters must be preserved. + delbuf[0] through delbuf[file_list_size] + store the delimiters for closed files. */ + char *delbuf; + int delims_saved; /* Number of delims saved in `delbuf'. */ + register char *delimptr; /* Cycling pointer into `delims'. */ + FILE **fileptr; /* Streams open to the files to process. */ + int files_open; /* Number of files still open to process. */ + int i; /* Loop index. */ + int opened_stdin = 0; /* Nonzero if any fopen got fd 0. */ + + delbuf = (char *) xmalloc (file_list_size + 2); + fileptr = (FILE **) xmalloc ((file_list_size + 1) * sizeof (FILE *)); + + /* Attempt to open all files. This could be expanded to an infinite + number of files, but at the (considerable) expense of remembering + each file and its current offset, then opening/reading/closing. 
*/ + + for (files_open = 0; files_open < nfiles; ++files_open) + { + if (files_open == file_list_size - 2) + { + file_list_size += 12; + delbuf = (char *) xrealloc (delbuf, file_list_size + 2); + fileptr = (FILE **) xrealloc (fileptr, (file_list_size + 1) + * sizeof (FILE *)); + } + if (!strcmp (fnamptr[files_open], "-")) + { + have_read_stdin = 1; + fileptr[files_open] = stdin; + } + else + { + fileptr[files_open] = fopen (fnamptr[files_open], "r"); + if (fileptr[files_open] == NULL) + error (1, errno, "%s", fnamptr[files_open]); + else if (fileno (fileptr[files_open]) == 0) + opened_stdin = 1; + } + } + + fileptr[files_open] = ENDLIST; + + if (opened_stdin && have_read_stdin) + error (1, 0, "standard input is closed"); + + /* Read a line from each file and output it to stdout separated by a + delimiter, until we go through the loop without successfully + reading from any of the files. */ + + while (files_open) + { + /* Set up for the next line. */ + somedone = 0; + delimptr = delims; + delims_saved = 0; + + for (i = 0; fileptr[i] != ENDLIST && files_open; i++) + { + line_length = 0; /* Clear so we can easily detect EOF. */ + if (fileptr[i] != CLOSED) + { + chr = getc (fileptr[i]); + if (chr != EOF && delims_saved) + { + fwrite (delbuf, sizeof (char), delims_saved, stdout); + delims_saved = 0; + } + + while (chr != EOF) + { + line_length++; + if (chr == '\n') + break; + putc (chr, stdout); + chr = getc (fileptr[i]); + } + } + + if (line_length == 0) + { + /* EOF, read error, or closed file. + If an EOF or error, close the file and mark it in the list. */ + if (fileptr[i] != CLOSED) + { + if (ferror (fileptr[i])) + { + error (0, errno, "%s", fnamptr[i]); + errors = 1; + } + if (fileptr[i] == stdin) + clearerr (fileptr[i]); /* Also clear EOF. */ + else if (fclose (fileptr[i]) == EOF) + { + error (0, errno, "%s", fnamptr[i]); + errors = 1; + } + + fileptr[i] = CLOSED; + files_open--; + } + + if (fileptr[i + 1] == ENDLIST) + { + /* End of this output line. 
+ Is this the end of the whole thing? */ + if (somedone) + { + /* No. Some files were not closed for this line. */ + if (delims_saved) + { + fwrite (delbuf, sizeof (char), delims_saved, stdout); + delims_saved = 0; + } + putc ('\n', stdout); + } + continue; /* Next read of files, or exit. */ + } + else + { + /* Closed file; add delimiter to `delbuf'. */ + if (*delimptr != EMPTY_DELIM) + delbuf[delims_saved++] = *delimptr; + if (++delimptr == delim_end) + delimptr = delims; + } + } + else + { + /* Some data read. */ + somedone++; + + /* Except for last file, replace last newline with delim. */ + if (fileptr[i + 1] != ENDLIST) + { + if (chr != '\n') + putc (chr, stdout); + if (*delimptr != EMPTY_DELIM) + putc (*delimptr, stdout); + if (++delimptr == delim_end) + delimptr = delims; + } + else + putc (chr, stdout); + } + } + } + return errors; +} + +/* Perform serial paste on the NFILES files named in FNAMPTR. + Return 0 if no errors, 1 if one or more files could not be + opened or read. */ + +int +paste_serial (nfiles, fnamptr) + int nfiles; + char **fnamptr; +{ + int errors = 0; /* 1 if open or read errors occur. */ + register int charnew, charold; /* Current and previous char read. */ + register char *delimptr; /* Current delimiter char. */ + register FILE *fileptr; /* Open for reading current file. */ + + for (; nfiles; nfiles--, fnamptr++) + { + if (!strcmp (*fnamptr, "-")) + { + have_read_stdin = 1; + fileptr = stdin; + } + else + { + fileptr = fopen (*fnamptr, "r"); + if (fileptr == NULL) + { + error (0, errno, "%s", *fnamptr); + errors = 1; + continue; + } + } + + delimptr = delims; /* Set up for delimiter string. */ + + charold = getc (fileptr); + if (charold != EOF) + { + /* `charold' is set up. Hit it! + Keep reading characters, stashing them in `charnew'; + output `charold', converting to the appropriate delimiter + character if needed. After the EOF, output `charold' + if it's a newline; otherwise, output it and then a newline. 
*/ + + while ((charnew = getc (fileptr)) != EOF) + { + /* Process the old character. */ + if (charold == '\n') + { + if (*delimptr != EMPTY_DELIM) + putc (*delimptr, stdout); + + if (++delimptr == delim_end) + delimptr = delims; + } + else + putc (charold, stdout); + + charold = charnew; + } + + /* Hit EOF. Process that last character. */ + putc (charold, stdout); + } + + if (charold != '\n') + putc ('\n', stdout); + + if (ferror (fileptr)) + { + error (0, errno, "%s", *fnamptr); + errors = 1; + } + if (fileptr == stdin) + clearerr (fileptr); /* Also clear EOF. */ + else if (fclose (fileptr) == EOF) + { + error (0, errno, "%s", *fnamptr); + errors = 1; + } + } + return errors; +} + +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-s] [-d delim-list] [--serial] [--delimiters=delim-list]\n\ + [file...]\n", + program_name); + exit (1); +} diff --git a/src/pr.c b/src/pr.c new file mode 100644 index 0000000..10595ad --- /dev/null +++ b/src/pr.c @@ -0,0 +1,1844 @@ +/* pr -- convert text files for printing. + Copyright (C) 1988, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Author: Pete TerMaat. */ + +/* Things to watch: Sys V screws up on ... 
+ pr -n -3 -s: /usr/dict/words + pr -m -o10 -n /usr/dict/words{,,,} + pr -6 -a -n -o5 /usr/dict/words + + Ideas: + + Keep a things_to_do list of functions to call when we know we have + something to print. Cleaner than current series of checks. + + Improve the printing of control prefixes. + + + Options: + + +PAGE Begin output at page PAGE of the output. + + -COLUMN Produce output that is COLUMN columns wide and print + columns down. + + -a Print columns across rather than down. The input + one + two + three + four + will be printed as + one two three + four + + -b Balance columns on the last page. + + -c Print unprintable characters as control prefixes. + Control-g is printed as ^G. + + -d Double space the output. + + -e[c[k]] Expand tabs to spaces on input. Optional argument C + is the input tab character. (Default is `\t'.) Optional + argument K is the input tab character's width. (Default is 8.) + + -F + -f Use formfeeds instead of newlines to separate pages. + + -h header Replace the filename in the header with the string HEADER. + + -i[c[k]] Replace spaces with tabs on output. Optional argument + C is the output tab character. (Default is `\t'.) Optional + argument K is the output tab character's width. (Default + is 8.) + + -l lines Set the page length to LINES. Default is 66. + + -m Print files in parallel. + + -n[c[k]] Precede each column with a line number. + (With parallel files, precede each line with a line + number.) Optional argument C is the character to print + after each number. (Default `\t'.) Optional argument + K is the number of digits per line number. (Default 5.) + + -o offset Offset each line with a margin OFFSET spaces wide. + Total page width is the size of this offset plus the + width set with `-w'. + + -r Ignore files that can't be opened. + + -s[c] Separate each line with a character. Optional argument C is + the character to be used. Default is `\t'. + + -t Do not print headers or footers. 
+ + -v Print unprintable characters as escape sequences. + Control-G becomes \007. + + -w width Set the page width to WIDTH characters. */ + +#include +#include +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISPRINT(c) (isascii (c) && isprint (c)) +#define ISDIGIT(c) (isascii (c) && isdigit (c)) +#else +#define ISPRINT(c) isprint (c) +#define ISDIGIT(c) isdigit (c) +#endif + +int char_to_clump (); +int read_line (); +int print_page (); +int print_stored (); +char *xmalloc (); +char *xrealloc (); +int open_file (); +int skip_to_page (); +void error (); +void getoptarg (); +void usage (); +void print_files (); +void init_header (); +void init_store_cols (); +void store_columns (); +void balance (); +void store_char (); +void pad_down (); +void read_rest_of_line (); +void print_char (); +void cleanup (); + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +/* Used with start_position in the struct COLUMN described below. + If start_position == ANYWHERE, we aren't truncating columns and + can begin printing a column anywhere. Otherwise we must pad to + the horizontal position start_position. */ +#define ANYWHERE 0 + +/* Each column has one of these structures allocated for it. + If we're only dealing with one file, fp is the same for all + columns. + + The general strategy is to spend time setting up these column + structures (storing columns if necessary), after which printing + is a matter of flitting from column to column and calling + print_func. + + Parallel files, single files printing across in multiple + columns, and single files printing down in multiple columns all + fit the same printing loop. + + print_func Function used to print lines in this column. + If we're storing this column it will be + print_stored(), Otherwise it will be read_line(). + + char_func Function used to process characters in this column. + If we're storing this column it will be store_char(), + otherwise it will be print_char(). 
+ + current_line Index of the current entry in line_vector, which + contains the index of the first character of the + current line in buff[]. + + lines_stored Number of lines in this column which are stored in + buff. + + lines_to_print If we're storing this column, lines_to_print is + the number of stored_lines which remain to be + printed. Otherwise it is the number of lines + we can print without exceeding lines_per_body. + + start_position The horizontal position we want to be in before we + print the first character in this column. + + numbered True means precede this column with a line number. */ + +struct COLUMN +{ + FILE *fp; /* Input stream for this column. */ + char *name; /* File name. */ + enum + { + OPEN, + ON_HOLD, /* Hit a form feed. */ + CLOSED + } status; /* Status of the file pointer. */ + int (*print_func) (); /* Func to print lines in this col. */ + void (*char_func) (); /* Func to print/store chars in this col. */ + int current_line; /* Index of current place in line_vector. */ + int lines_stored; /* Number of lines stored in buff. */ + int lines_to_print; /* No. lines stored or space left on page. */ + int start_position; /* Horizontal position of first char. */ + int numbered; +}; + +typedef struct COLUMN COLUMN; + +#define NULLCOL (COLUMN *)0 + +/* All of the columns to print. */ +COLUMN *column_vector; + +/* When printing a single file in multiple downward columns, + we store the leftmost columns contiguously in buff. + To print a line from buff, get the index of the first char + from line_vector[i], and print up to line_vector[i + 1]. */ +char *buff; + +/* Index of the position in buff where the next character + will be stored. */ +int buff_current; + +/* The number of characters in buff. + Used for allocation of buff and to detect overflow of buff. */ +int buff_allocated; + +/* Array of indices into buff. + Each entry is an index of the first character of a line. 
+ This is used when storing lines to facilitate shuffling when + we do column balancing on the last page. */ +int *line_vector; + +/* Array of horizonal positions. + For each line in line_vector, end_vector[line] is the horizontal + position we are in after printing that line. We keep track of this + so that we know how much we need to pad to prepare for the next + column. */ +int *end_vector; + +/* (-m) True means we're printing multiple files in parallel. */ +int parallel_files = FALSE; + +/* (-[0-9]+) True means we're given an option explicitly specifying + number of columns. Used to detect when this option is used with -m. */ +int explicit_columns = FALSE; + +/* (-t) True means we're printing headers and footers. */ +int extremities = TRUE; + +/* True means we need to print a header as soon as we know we've got input + to print after it. */ +int print_a_header; + +/* (-h) True means we're using the standard header rather than a + customized one specified by the -h flag. */ +int standard_header = TRUE; + +/* (-f) True means use formfeeds instead of newlines to separate pages. */ +int use_form_feed = FALSE; + +/* True means we haven't encountered any filenames in the argument list. */ +int input_is_stdin = TRUE; + +/* True means we have read the standard input. */ +int have_read_stdin = FALSE; + +/* True means the -a flag has been given. */ +int print_across_flag = FALSE; + +/* True means we're printing one file in multiple (>1) downward columns. */ +int storing_columns = TRUE; + +/* (-b) True means balance columns on the last page as Sys V does. */ +int balance_columns = FALSE; + +/* (-l) Number of lines on a page, including header and footer lines. */ +int lines_per_page = 66; + +/* Number of lines in the header and footer can be reset to 0 using + the -t flag. */ +int lines_per_header = 5; +int lines_per_body; +int lines_per_footer = 5; + +/* (-w) Width in characters of the page. Does not include the width of + the margin. 
*/ +int chars_per_line = 72; + +/* Number of characters in a column. Based on the gutter and page widths. */ +int chars_per_column; + +/* (-e) True means convert tabs to spaces on input. */ +int untabify_input = FALSE; + +/* (-e) The input tab character. */ +char input_tab_char = '\t'; + +/* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ... + where the leftmost column is 1. */ +int chars_per_input_tab = 8; + +/* (-i) True means convert spaces to tabs on output. */ +int tabify_output = FALSE; + +/* (-i) The output tab character. */ +char output_tab_char = '\t'; + +/* (-i) The width of the output tab. */ +int chars_per_output_tab = 8; + +/* Keeps track of pending white space. When we hit a nonspace + character after some whitespace, we print whitespace, tabbing + if necessary to get to output_position + spaces_not_printed. */ +int spaces_not_printed; + +/* Number of spaces between columns (though tabs can be used when possible to + use up the equivalent amount of space). Not sure if this is worth making + a flag for. BSD uses 0, Sys V uses 1. Sys V looks better. */ +int chars_per_gutter = 1; + +/* (-o) Number of spaces in the left margin (tabs used when possible). */ +int chars_per_margin = 0; + +/* Position where the next character will fall. + Leftmost position is 0 + chars_per_margin. + Rightmost position is chars_per_margin + chars_per_line - 1. + This is important for converting spaces to tabs on output. */ +int output_position; + +/* Horizontal position relative to the current file. + (output_position depends on where we are on the page; + input_position depends on where we are in the file.) + Important for converting tabs to spaces on input. */ +int input_position; + +/* Count number of failed opens so we can exit with non-zero + status if there were any. */ +int failed_opens = 0; + +/* The horizontal position we'll be at after printing a tab character + of width c_ from the position h_. 
*/ +#define pos_after_tab(c_, h_) h_ - h_ % c_ + c_ + +/* The number of spaces taken up if we print a tab character with width + c_ from position h_. */ +#define tab_width(c_, h_) - h_ % c_ + c_ + +/* (-NNN) Number of columns of text to print. */ +int columns = 1; + +/* (+NNN) Page number on which to begin printing. */ +int first_page_number = 1; + +/* Number of files open (not closed, not on hold). */ +int files_ready_to_read = 0; + +/* Number of columns with either an open file or stored lines. */ +int cols_ready_to_print = 0; + +/* Current page number. Displayed in header. */ +int page_number; + +/* Current line number. Displayed when -n flag is specified. + + When printing files in parallel (-m flag), line numbering is as follows: + 1 foo goo moo + 2 hoo too zoo + + When printing files across (-a flag), ... + 1 foo 2 moo 3 goo + 4 hoo 3 too 6 zoo + + Otherwise, line numbering is as follows: + 1 foo 3 goo 5 too + 2 moo 4 hoo 6 zoo */ +int line_number; + +/* (-n) True means lines should be preceded by numbers. */ +int numbered_lines = FALSE; + +/* True means print a number as soon as we know we'll be printing + from the current column. */ +int print_a_number; + +/* (-n) Character which follows each line number. */ +char number_separator = '\t'; + +/* (-n) Width in characters of a line number. */ +int chars_per_number = 5; + +/* Used when widening the first column to accommodate numbers -- only + needed when printing files in parallel. Includes width of both the + number and the number_separator. */ +int number_width; + +/* Buffer sprintf uses to format a line number. */ +char *number_buff; + +/* (-v) True means unprintable characters are printed as escape sequences. + control-g becomes \007. */ +int use_esc_sequence = FALSE; + +/* (-c) True means unprintable characters are printed as control prefixes. + control-g becomes ^G. */ +int use_cntrl_prefix = FALSE; + +/* (-d) True means output is double spaced. 
*/
int double_space = FALSE;

/* Number of files opened by the current call to init_fps (open_file
   increments it). Should be 1 unless we're printing multiple files
   in parallel. */
int total_files = 0;

/* (-r) True means don't complain if we can't open a file. */
int ignore_failed_opens = FALSE;

/* (-s) True means we separate columns with a specified character. */
int use_column_separator = FALSE;

/* Character used to separate columns if the -s flag has been specified. */
char column_separator = '\t';

/* Number of separator characters waiting to be printed as soon as we
   know that we have any input remaining to be printed. */
int separators_not_printed;

/* Position we need to pad to, as soon as we know that we have input
   remaining to be printed. */
int padding_not_printed;

/* True means we should pad the end of the page. Remains false until we
   know we have a page to print. */
int pad_vertically;

/* (-h) String of characters used in place of the filename in the header. */
char *custom_header;

/* String containing the date, filename or custom header, and "Page ". */
char *header;

/* Scratch buffer filled by char_to_clump and consumed by print_clump;
   (re)allocated in init_parameters. */
int *clump_buff;

/* True means we truncate lines longer than chars_per_column. */
int truncate_lines = FALSE;

/* The name under which this program was invoked. */
char *program_name;

/* Parse the command line and print the requested files.

   Options and file names are processed strictly in order.  File names
   are collected in file_name_vector; as soon as anything that is not a
   file name follows them (another option, or the end of the
   arguments), the collected batch is printed with the options seen so
   far and cleanup () releases the per-batch buffers.  Digit options
   accumulate into ACCUM and become the column count once a non-digit
   is seen.  If no file was ever printed and input_is_stdin is still
   set, standard input is printed at the end.

   NOTE(review): declared `void main' and leaves via exit (); status is
   1 if any open failed, 0 otherwise (usage () exits with 2). */
void
main (argc, argv)
     int argc;
     char **argv;
{
  int c;
  char *s;
  int files = 0;
  char **file_names, **file_name_vector;
  int accum = 0;

  program_name = argv[0];

  /* Room for one pointer per argument.
     NOTE(review): the element type is `char *'; sizeof (char **) is
     presumably the same size on the targeted platforms -- confirm. */
  file_name_vector = (char **) xmalloc (argc * sizeof (char **));
  file_names = file_name_vector;

  for (;;)
    {
      c = getopt (argc, argv, "-0123456789abcde::fFh:i::l:mn::o:rs::tvw:");

      if (c == 1) /* Non-option argument. */
        {
          s = optarg;
          if (*s == '+')
            {
              if (!ISDIGIT (*++s))
                usage ("`+' requires a numeric argument");
              first_page_number = atoi (s);
            }
          else
            {
              *file_names++ = optarg;
              ++files;
            }
        }
      else if (files > 0)
        {
          /* Something other than a file name follows a batch of file
             names (this includes EOF): print the batch now. */
          if (parallel_files && explicit_columns)
            error (1, 0,
"Cannot specify number of columns when printing in parallel.");

          if (parallel_files && print_across_flag)
            error (1, 0,
"Cannot specify both printing across and printing in parallel.");

          if (parallel_files)
            print_files (files, file_name_vector);
          else
            {
              file_names = file_name_vector;
              while (files--)
                print_files (1, file_names++);
            }

          input_is_stdin = FALSE;
          file_names = file_name_vector;
          files = 0;
          cleanup ();
        }

      /* Digits build up the column count; any other return value from
         getopt ends the number. */
      if (ISDIGIT (c))
        {
          accum = accum * 10 + c - '0';
          continue;
        }
      else
        {
          if (accum > 0)
            {
              columns = accum;
              explicit_columns = TRUE;
            }
          accum = 0;
        }

      switch (c)
        {
        case 'a':
          print_across_flag = TRUE;
          storing_columns = FALSE;
          break;
        case 'b':
          balance_columns = TRUE;
          break;
        case 'c':
          use_cntrl_prefix = TRUE;
          break;
        case 'd':
          double_space = TRUE;
          break;
        case 'e':
          if (optarg)
            getoptarg (optarg, 'e', &input_tab_char,
                       &chars_per_input_tab);
          /* Could check tab width > 0. */
          untabify_input = TRUE;
          break;
        case 'f':
        case 'F':
          use_form_feed = TRUE;
          break;
        case 'h':
          custom_header = optarg;
          standard_header = FALSE;
          break;
        case 'i':
          if (optarg)
            getoptarg (optarg, 'i', &output_tab_char,
                       &chars_per_output_tab);
          /* Could check tab width > 0. */
          tabify_output = TRUE;
          break;
        case 'l':
          lines_per_page = atoi (optarg);
          break;
        case 'm':
          parallel_files = TRUE;
          storing_columns = FALSE;
          break;
        case 'n':
          numbered_lines = TRUE;
          if (optarg)
            getoptarg (optarg, 'n', &number_separator,
                       &chars_per_number);
          break;
        case 'o':
          chars_per_margin = atoi (optarg);
          break;
        case 'r':
          ignore_failed_opens = TRUE;
          break;
        case 's':
          use_column_separator = TRUE;
          if (optarg)
            {
              /* Exactly one separator character is accepted. */
              s = optarg;
              column_separator = *s;
              if (*++s)
                {
                  fprintf (stderr, "\
%s: extra characters in the argument to the `-s' option: `%s'\n",
                           program_name, s);
                  usage ((char *) 0);
                }
            }
          break;
        case 't':
          extremities = FALSE;
          break;
        case 'v':
          use_esc_sequence = TRUE;
          break;
        case 'w':
          chars_per_line = atoi (optarg);
          break;
        case '?':
          usage ((char *) 0);
          break;
        }

      if (c == EOF)
        break;
    }

  if (input_is_stdin)
    print_files (0, (char **) 0);

  if (have_read_stdin && fclose (stdin) == EOF)
    error (1, errno, "standard input");
  if (ferror (stdout) || fclose (stdout) == EOF)
    error (1, errno, "write error");
  if (failed_opens > 0)
    exit(1);
  exit (0);
}

/* Parse options of the form -scNNN.

   Example: -nck, where 'n' is the option, c is the optional number
   separator, and k is the optional width of the field used when printing
   a number.

   ARG is the text following the option letter SWITCH_CHAR.  A leading
   non-digit is stored through CHARACTER; a following run of digits is
   converted with atoi and stored through NUMBER.  Anything else after
   the optional character is reported and usage () is called.  Outputs
   that are not present in ARG are left untouched.

   NOTE(review): the number is not range-checked here, so a width of 0
   can reach the tab arithmetic -- see "Could check tab width > 0" in
   main. */

void
getoptarg (arg, switch_char, character, number)
     char *arg, switch_char, *character;
     int *number;
{
  if (!ISDIGIT (*arg))
    *character = *arg++;
  if (*arg)
    {
      if (ISDIGIT (*arg))
        *number = atoi (arg);
      else
        {
          fprintf (stderr, "\
%s: extra characters in the argument to the `-%c' option: `%s'\n",
                   program_name, switch_char, arg);
          usage ((char *) 0);
        }
    }
}

/* Set parameters related to formatting.
*/ + +void +init_parameters (number_of_files) + int number_of_files; +{ + int chars_used_by_number = 0; + + lines_per_body = lines_per_page - lines_per_header - lines_per_footer; + if (lines_per_body <= 0) + extremities = FALSE; + if (extremities == FALSE) + lines_per_body = lines_per_page; + + if (double_space) + lines_per_body = lines_per_body / 2; + + /* If input is stdin, cannot print parallel files. BSD dumps core + on this. */ + if (number_of_files == 0) + parallel_files = FALSE; + + if (parallel_files) + columns = number_of_files; + + /* Tabification is assumed for multiple columns. */ + if (columns > 1) + { + if (!use_column_separator) + truncate_lines = TRUE; + + untabify_input = TRUE; + tabify_output = TRUE; + } + else + storing_columns = FALSE; + + if (numbered_lines) + { + if (number_separator == input_tab_char) + { + number_width = chars_per_number + + tab_width (chars_per_input_tab, + (chars_per_margin + chars_per_number)); + } + else + number_width = chars_per_number + 1; + /* The number is part of the column width unless we are + printing files in parallel. */ + if (parallel_files) + chars_used_by_number = number_width; + } + + chars_per_column = (chars_per_line - chars_used_by_number - + (columns - 1) * chars_per_gutter) / columns; + + if (chars_per_column < 1) + error (1, 0, "page width too narrow"); + + if (numbered_lines) + { + if (number_buff != (char *) 0) + free (number_buff); + number_buff = (char *) + xmalloc (2 * chars_per_number * sizeof (char)); + } + + /* Pick the maximum between the tab width and the width of an + escape sequence. */ + if (clump_buff != (int *) 0) + free (clump_buff); + clump_buff = (int *) xmalloc ((chars_per_input_tab > 4 + ? chars_per_input_tab : 4) * sizeof (int)); +} + +/* Open the necessary files, + maintaining a COLUMN structure for each column. + + With multiple files, each column p has a different p->fp. + With single files, each column p has the same p->fp. 
   Return 1 if (number_of_files > 0) and no files can be opened,
   0 otherwise.

   AV points at NUMBER_OF_FILES file names.  column_vector is freed and
   reallocated with `columns' entries on every call, and total_files is
   recounted (open_file increments it).  On success files_ready_to_read
   is set to total_files. */

int
init_fps (number_of_files, av)
     int number_of_files;
     char **av;
{
  int i, files_left;
  COLUMN *p;
  FILE *firstfp;
  char *firstname;

  total_files = 0;

  if (column_vector != NULLCOL)
    free ((char *) column_vector);
  column_vector = (COLUMN *) xmalloc (columns * sizeof (COLUMN));

  if (parallel_files)
    {
      files_left = number_of_files;
      for (p = column_vector; files_left--; ++p, ++av)
        {
          if (open_file (*av, p) == 0)
            {
              /* Reuse this slot for the next file and print one
                 column fewer. */
              --p;
              --columns;
            }
        }
      if (columns == 0)
        return 1;
      init_header ("", -1);
    }
  else
    {
      p = column_vector;
      if (number_of_files > 0)
        {
          if (open_file (*av, p) == 0)
            return 1;
          init_header (*av, fileno (p->fp));
        }
      else
        {
          p->name = "standard input";
          p->fp = stdin;
          have_read_stdin = TRUE;
          p->status = OPEN;
          ++total_files;
          init_header ("", -1);
        }

      /* Every remaining column reads from the same stream as the
         first one. */
      firstname = p->name;
      firstfp = p->fp;
      for (i = columns - 1, ++p; i; --i, ++p)
        {
          p->name = firstname;
          p->fp = firstfp;
          p->status = OPEN;
        }
    }
  files_ready_to_read = total_files;
  return 0;
}

/* Determine print_func and char_func, the functions
   used by each column for printing and/or storing.

   Determine the horizontal position desired when we begin
   printing a column (p->start_position).

   H is the start position of the column being set up and H_NEXT the
   position just past its text; both become ANYWHERE once column
   separators are in use. */

void
init_funcs ()
{
  int i, h, h_next;
  COLUMN *p;

  h = chars_per_margin;

  if (use_column_separator)
    h_next = ANYWHERE;
  else
    {
      /* When numbering lines of parallel files, we enlarge the
         first column to accommodate the number. Looks better than
         the Sys V approach. */
      if (parallel_files && numbered_lines)
        h_next = h + chars_per_column + number_width;
      else
        h_next = h + chars_per_column;
    }

  /* This loop takes care of all but the rightmost column. */

  for (p = column_vector, i = 1; i < columns; ++p, ++i)
    {
      if (storing_columns) /* One file, multi columns down. */
        {
          p->char_func = store_char;
          p->print_func = print_stored;
        }
      else
        /* One file, multi columns across; or parallel files. */
        {
          p->char_func = print_char;
          p->print_func = read_line;
        }

      /* Number only the first column when printing files in
         parallel. */
      p->numbered = numbered_lines && (!parallel_files || i == 1);
      p->start_position = h;

      /* If we're using separators, all start_positions are
         ANYWHERE, except the first column's start_position when
         using a margin. */

      if (use_column_separator)
        {
          h = ANYWHERE;
          h_next = ANYWHERE;
        }
      else
        {
          h = h_next + chars_per_gutter;
          h_next = h + chars_per_column;
        }
    }

  /* The rightmost column.

     Doesn't need to be stored unless we intend to balance
     columns on the last page. */
  if (storing_columns && balance_columns)
    {
      p->char_func = store_char;
      p->print_func = print_stored;
    }
  else
    {
      p->char_func = print_char;
      p->print_func = read_line;
    }

  /* I and H still hold the values for this last column. */
  p->numbered = numbered_lines && (!parallel_files || i == 1);
  p->start_position = h;
}

/* Open a file. Return nonzero if successful, zero if failed.

   NAME "-" selects standard input and records that in
   have_read_stdin.  On failure failed_opens is incremented and,
   unless ignore_failed_opens is set, the error is reported.  On
   success P is marked OPEN and total_files is incremented. */

int
open_file (name, p)
     char *name;
     COLUMN *p;
{
  if (!strcmp (name, "-"))
    {
      p->name = "standard input";
      p->fp = stdin;
      have_read_stdin = 1;
    }
  else
    {
      p->name = name;
      p->fp = fopen (name, "r");
    }
  if (p->fp == NULL)
    {
      ++failed_opens;
      if (!ignore_failed_opens)
        error (0, errno, "%s", name);
      return 0;
    }
  p->status = OPEN;
  ++total_files;
  return 1;
}

/* Close the file in P.

   If we aren't dealing with multiple files in parallel, we change
   the status of all columns in the column list to reflect the close.
*/ + +void +close_file (p) + COLUMN *p; +{ + COLUMN *q; + int i; + + if (p->status == CLOSED) + return; + if (ferror (p->fp)) + error (1, errno, "%s", p->name); + if (p->fp != stdin && fclose (p->fp) == EOF) + error (1, errno, "%s", p->name); + + if (!parallel_files) + { + for (q = column_vector, i = columns; i; ++q, --i) + { + q->status = CLOSED; + if (q->lines_stored == 0) + { +#if 0 + if (cols_ready_to_print > 0) + --cols_ready_to_print; +#endif + q->lines_to_print = 0; + } + } + } + else + { + p->status = CLOSED; + p->lines_to_print = 0; + } + + --files_ready_to_read; +} + +/* Put a file on hold until we start a new page, + since we've hit a form feed. + + If we aren't dealing with parallel files, we must change the + status of all columns in the column list. */ + +void +hold_file (p) + COLUMN *p; +{ + COLUMN *q; + int i; + + if (!parallel_files) + for (q = column_vector, i = columns; i; ++q, --i) + q->status = ON_HOLD; + else + p->status = ON_HOLD; + p->lines_to_print = 0; + --files_ready_to_read; +} + +/* Undo hold_file -- go through the column list and change any + ON_HOLD columns to OPEN. Used at the end of each page. */ + +void +reset_status () +{ + int i = columns; + COLUMN *p; + + for (p = column_vector; i; --i, ++p) + if (p->status == ON_HOLD) + { + p->status = OPEN; + files_ready_to_read++; + } +} + +/* Print a single file, or multiple files in parallel. + + Set up the list of columns, opening the necessary files. + Allocate space for storing columns, if necessary. + Skip to first_page_number, if user has asked to skip leading pages. + Determine which functions are appropriate to store/print lines + in each column. + Print the file(s). 
*/ + +void +print_files (number_of_files, av) + int number_of_files; + char **av; +{ + init_parameters (number_of_files); + if (init_fps (number_of_files, av)) + return; + if (storing_columns) + init_store_cols (); + + if (first_page_number > 1) + { + if (!skip_to_page (first_page_number)) + return; + else + page_number = first_page_number; + } + else + page_number = 1; + + init_funcs (); + + line_number = 1; + while (print_page ()) + ; +} + +/* Generous estimate of number of characters taken up by "Jun 7 00:08 " and + "Page NNNNN". */ +#define CHARS_FOR_DATE_AND_PAGE 50 + +/* Initialize header information. + If DESC is non-negative, it is a file descriptor open to + FILENAME for reading. + + Allocate space for a header string, + Determine the time, insert file name or user-specified string. + + It might be nice to have a "blank headers" option, since + pr -h "" still prints the date and page number. */ + +void +init_header (filename, desc) + char *filename; + int desc; +{ + int chars_per_header; + char *f = filename; + char *t, *middle; + struct stat st; + + if (filename == 0) + f = ""; + + /* If parallel files or standard input, use current time. */ + if (desc < 0 || !strcmp (filename, "-") || fstat (desc, &st)) + st.st_mtime = time ((time_t *) 0); + t = ctime (&st.st_mtime); + + t[16] = '\0'; /* Mark end of month and time string. */ + t[24] = '\0'; /* Mark end of year string. */ + + middle = standard_header ? f : custom_header; + + chars_per_header = strlen (middle) + CHARS_FOR_DATE_AND_PAGE + 1; + if (header != (char *) 0) + free (header); + header = (char *) xmalloc (chars_per_header * sizeof (char)); + + sprintf (header, "%s %s %s Page", &t[4], &t[20], middle); +} + +/* Set things up for printing a page + + Scan through the columns ... 
   Determine which are ready to print
   (i.e., which have lines stored or open files)
   Set p->lines_to_print appropriately
   (to p->lines_stored if we're storing, or lines_per_body
   if we're reading straight from the file)
   Keep track of this total so we know when to stop printing

   The total is left in cols_ready_to_print.  When storing columns,
   store_columns () is called first to fill the line buffer for this
   page. */

void
init_page ()
{
  int j;
  COLUMN *p;

  cols_ready_to_print = 0;

  if (storing_columns)
    {
      store_columns ();
      /* All columns but the last print what was just stored. */
      for (j = columns - 1, p = column_vector; j; --j, ++p)
        {
          p->lines_to_print = p->lines_stored;
          if (p->lines_to_print != 0)
            ++cols_ready_to_print;
        }

      /* Last column. */
      if (balance_columns)
        {
          p->lines_to_print = p->lines_stored;
          if (p->lines_to_print != 0)
            ++cols_ready_to_print;
        }
      /* Since we're not balancing columns, we don't need to store
         the rightmost column. Read it straight from the file. */
      else
        {
          if (p->status == OPEN)
            {
              p->lines_to_print = lines_per_body;
              ++cols_ready_to_print;
            }
          else
            p->lines_to_print = 0;
        }
    }
  else
    /* Nothing is stored: every open column reads a full page body
       straight from its file. */
    for (j = columns, p = column_vector; j; --j, ++p)
      if (p->status == OPEN)
        {
          p->lines_to_print = lines_per_body;
          ++cols_ready_to_print;
        }
      else
        p->lines_to_print = 0;
}

/* Print one page.

   As long as there are lines left on the page and columns ready to print,
   Scan across the column list
   if the column has stored lines or the file is open
   pad to the appropriate spot
   print the column
   pad the remainder of the page with \n or \f as requested
   reset the status of all files -- any files which were on hold because
   of formfeeds are now put back into the lineup.

   Return FALSE if no column had anything to print (nothing is
   output), TRUE otherwise. */

int
print_page ()
{
  int j;
  int lines_left_on_page;
  COLUMN *p;

  /* Used as an accumulator (with | operator) of successive values of
     pad_vertically. The trick is to set pad_vertically
     to zero before each run through the inner loop, then after that
     loop, it tells us whether a line was actually printed (whether a
     newline needs to be output -- or two for double spacing). But those
     values have to be accumulated (in pv) so we can invoke pad_down
     properly after the outer loop completes. */
  int pv;

  init_page ();

  if (cols_ready_to_print == 0)
    return FALSE;

  if (extremities)
    print_a_header = TRUE;

  /* Don't pad unless we know a page was printed. */
  pad_vertically = FALSE;
  pv = FALSE;

  lines_left_on_page = lines_per_body;
  if (double_space)
    lines_left_on_page *= 2;

  /* One iteration per output line. */
  while (lines_left_on_page > 0 && cols_ready_to_print > 0)
    {
      output_position = 0;
      spaces_not_printed = 0;
      separators_not_printed = 0;
      pad_vertically = FALSE;

      for (j = 1, p = column_vector; j <= columns; ++j, ++p)
        {
          input_position = 0;
          if (p->lines_to_print > 0)
            {
              padding_not_printed = p->start_position;

              /* A false return means the line was cut short;
                 discard its remainder. */
              if (!(p->print_func) (p))
                read_rest_of_line (p);
              pv |= pad_vertically;

              if (use_column_separator)
                ++separators_not_printed;

              /* Stop scanning once the last ready column has run
                 out of lines. */
              if (--p->lines_to_print <= 0 && --cols_ready_to_print <= 0)
                break;
            }
        }

      if (pad_vertically)
        {
          putchar ('\n');
          --lines_left_on_page;
        }

      if (double_space && pv && extremities)
        {
          putchar ('\n');
          --lines_left_on_page;
        }
    }

  pad_vertically = pv;

  if (pad_vertically && extremities)
    pad_down (lines_left_on_page + lines_per_footer);

  reset_status (); /* Change ON_HOLD to OPEN. */

  return TRUE; /* More pages to go. */
}

/* Allocate space for storing columns.

   This is necessary when printing multiple columns from a single file.
   Lines are stored consecutively in buff, separated by '\0'.
   (We can't use a fixed offset since with the '-s' flag lines aren't
   truncated.)

   We maintain a list (line_vector) of pointers to the beginnings
   of lines in buff. We allocate one more than the number of lines
   because the last entry tells us the index of the last character,
   which we need to know in order to print the last line in buff.
*/ + +void +init_store_cols () +{ + int total_lines = lines_per_body * columns; + int chars_if_truncate = total_lines * (chars_per_column + 1); + + if (line_vector != (int *) 0) + free ((int *) line_vector); + line_vector = (int *) xmalloc ((total_lines + 1) * sizeof (int *)); + + if (end_vector != (int *) 0) + free ((int *) end_vector); + end_vector = (int *) xmalloc (total_lines * sizeof (int *)); + + if (buff != (char *) 0) + free (buff); + buff_allocated = use_column_separator ? 2 * chars_if_truncate + : chars_if_truncate; /* Tune this. */ + buff = (char *) xmalloc (buff_allocated * sizeof (char)); +} + +/* Store all but the rightmost column. + (Used when printing a single file in multiple downward columns) + + For each column + set p->current_line to be the index in line_vector of the + first line in the column + For each line in the column + store the line in buff + add to line_vector the index of the line's first char + buff_start is the index in buff of the first character in the + current line. */ + +void +store_columns () +{ + int i, j; + int line = 0; + int buff_start; + int last_col; /* The rightmost column which will be saved in buff */ + COLUMN *p; + + buff_current = 0; + buff_start = 0; + + if (balance_columns) + last_col = columns; + else + last_col = columns - 1; + + for (i = 1, p = column_vector; i <= last_col; ++i, ++p) + p->lines_stored = 0; + + for (i = 1, p = column_vector; i <= last_col && files_ready_to_read; + ++i, ++p) + { + p->current_line = line; + for (j = lines_per_body; j && files_ready_to_read; --j) + + if (p->status == OPEN) /* Redundant. Clean up. */ + { + input_position = 0; + + if (!read_line (p, i)) + read_rest_of_line (p); + + if (p->status == OPEN + || buff_start != buff_current) + { + ++p->lines_stored; + line_vector[line] = buff_start; + end_vector[line++] = input_position; + buff_start = buff_current; + } + } + } + + /* Keep track of the location of the last char in buff. 
*/ + line_vector[line] = buff_start; + + if (balance_columns && p->lines_stored != lines_per_body) + balance (line); +} + +void +balance (total_stored) + int total_stored; +{ + COLUMN *p; + int i, lines; + int first_line = 0; + + for (i = 1, p = column_vector; i <= columns; ++i, ++p) + { + lines = total_stored / columns; + if (i <= total_stored % columns) + ++lines; + + p->lines_stored = lines; + p->current_line = first_line; + + first_line += lines; + } +} + +/* Store a character in the buffer. */ + +void +store_char (c) + int c; +{ + if (buff_current >= buff_allocated) + { + /* May be too generous. */ + buff_allocated = 2 * buff_allocated; + buff = (char *) xrealloc (buff, buff_allocated * sizeof (char)); + } + buff[buff_current++] = (char) c; +} + +void +number (p) + COLUMN *p; +{ + int i; + char *s; + + sprintf (number_buff, "%*d", chars_per_number, line_number++); + s = number_buff; + for (i = chars_per_number; i > 0; i--) + (p->char_func) ((int) *s++); + + if (number_separator == input_tab_char) + { + i = number_width - chars_per_number; + while (i-- > 0) + (p->char_func) ((int) ' '); + } + else + (p->char_func) ((int) number_separator); + + if (truncate_lines && !parallel_files) + input_position += number_width; +} + +/* Print (or store) padding until the current horizontal position + is position. */ + +void +pad_across_to (position) + int position; +{ + register int h = output_position; + + if (tabify_output) + spaces_not_printed = position - output_position; + else + { + while (++h <= position) + putchar (' '); + output_position = position; + } +} + +/* Pad to the bottom of the page. + + If the user has requested a formfeed, use one. + Otherwise, use newlines. */ + +void +pad_down (lines) + int lines; +{ + register int i; + + if (use_form_feed) + putchar ('\f'); + else + for (i = lines; i; --i) + putchar ('\n'); +} + +/* Read the rest of the line. + + Read from the current column's file until an end of line is + hit. 
Used when we've truncated a line and we no longer need + to print or store its characters. */ + +void +read_rest_of_line (p) + COLUMN *p; +{ + register int c; + FILE *f = p->fp; + + while ((c = getc (f)) != '\n') + { + if (c == '\f') + { + hold_file (p); + break; + } + else if (c == EOF) + { + close_file (p); + break; + } + } +} + +/* If we're tabifying output, + + When print_char encounters white space it keeps track + of our desired horizontal position and delays printing + until this function is called. */ + +void +print_white_space () +{ + register int h_new; + register int h_old = output_position; + register int goal = h_old + spaces_not_printed; + + while (goal - h_old > 1 + && (h_new = pos_after_tab (chars_per_output_tab, h_old)) <= goal) + { + putchar (output_tab_char); + h_old = h_new; + } + while (++h_old <= goal) + putchar (' '); + + output_position = goal; + spaces_not_printed = 0; +} + +/* Print column separators. + + We keep a count until we know that we'll be printing a line, + then print_separators() is called. */ + +void +print_separators () +{ + for (; separators_not_printed > 0; --separators_not_printed) + print_char (column_separator); +} + +/* Print (or store, depending on p->char_func) a clump of N + characters. */ + +void +print_clump (p, n, clump) + COLUMN *p; + int n; + int *clump; +{ + while (n--) + (p->char_func) (*clump++); +} + +/* Print a character. + + If we're tabifying, all tabs have been converted to spaces by + process_char(). Keep a count of consecutive spaces, and when + a nonspace is encountered, call print_white_space() to print the + required number of tabs and spaces. */ + +void +print_char (c) + int c; +{ + if (tabify_output) + { + if (c == ' ') + { + ++spaces_not_printed; + return; + } + else if (spaces_not_printed > 0) + print_white_space (); + + /* Nonprintables are assumed to have width 0, except '\b'. 
*/ + if (!ISPRINT (c)) + { + if (c == '\b') + --output_position; + } + else + ++output_position; + } + putchar (c); +} + +/* Skip to page PAGE before printing. */ + +int +skip_to_page (page) + int page; +{ + int n, i, j; + COLUMN *p; + + for (n = 1; n < page; ++n) + { + for (i = 1; i <= lines_per_body; ++i) + { + for (j = 1, p = column_vector; j <= columns; ++j, ++p) + read_rest_of_line (p); + } + reset_status (); + } + return files_ready_to_read > 0; +} + +/* Print a header. + + Formfeeds are assumed to use up two lines at the beginning of + the page. */ + +void +print_header () +{ + if (!use_form_feed) + fprintf (stdout, "\n\n"); + + output_position = 0; + pad_across_to (chars_per_margin); + print_white_space (); + + fprintf (stdout, "%s %d\n\n\n", header, page_number++); + + print_a_header = FALSE; + output_position = 0; +} + +/* Print (or store, if p->char_func is store_char()) a line. + + Read a character to determine whether we have a line or not. + (We may hit EOF, \n, or \f) + + Once we know we have a line, + set pad_vertically = TRUE, meaning it's safe + to pad down at the end of the page, since we do have a page. + print a header if needed. + pad across to padding_not_printed if needed. + print any separators which need to be printed. + print a line number if it needs to be printed. + + Print the clump which corresponds to the first character. + + Enter a loop and keep printing until an end of line condition + exists, or until we exceed chars_per_column. + + Return FALSE if we exceed chars_per_column before reading + an end of line character, TRUE otherwise. 
   A form feed or end of file met before anything else puts the file
   on hold or closes it, and TRUE is returned.  When a character
   would overflow a truncated column, input_position is restored to
   its value before that character and FALSE is returned; the caller
   then discards the rest of the line. */

int
read_line (p)
     COLUMN *p;
{
  register int c, chars;
  int last_input_position;

  c = getc (p->fp);

  last_input_position = input_position;
  switch (c)
    {
    case '\f':
      hold_file (p);
      return TRUE;
    case EOF:
      close_file (p);
      return TRUE;
    case '\n':
      /* An empty line; CHARS stays unset and is not used below. */
      break;
    default:
      chars = char_to_clump (c);
    }

  if (truncate_lines && input_position > chars_per_column)
    {
      input_position = last_input_position;
      return FALSE;
    }

  /* The page-level output below happens only when printing directly;
     when storing, print_stored does it at print time. */
  if (p->char_func != store_char)
    {
      pad_vertically = TRUE;

      if (print_a_header)
        print_header ();

      if (padding_not_printed != ANYWHERE)
        {
          pad_across_to (padding_not_printed);
          padding_not_printed = ANYWHERE;
        }

      if (use_column_separator)
        print_separators ();
    }

  if (p->numbered)
    number (p);

  if (c == '\n')
    return TRUE;

  /* Emit the clump for the first character, then the rest of the
     line. */
  print_clump (p, chars, clump_buff);

  for (;;)
    {
      c = getc (p->fp);

      switch (c)
        {
        case '\n':
          return TRUE;
        case '\f':
          hold_file (p);
          return TRUE;
        case EOF:
          close_file (p);
          return TRUE;
        }

      last_input_position = input_position;
      chars = char_to_clump (c);
      if (truncate_lines && input_position > chars_per_column)
        {
          input_position = last_input_position;
          return FALSE;
        }

      print_clump (p, chars, clump_buff);
    }
}

/* Print a line from buff.

   If this function has been called, we know we have something to
   print. Therefore we set pad_vertically to TRUE, print
   a header if necessary, pad across if necessary, and print
   separators if necessary.

   Return TRUE, meaning there is no need to call read_rest_of_line.
*/ + +int +print_stored (p) + COLUMN *p; +{ + int line = p->current_line++; + register char *first = &buff[line_vector[line]]; + register char *last = &buff[line_vector[line + 1]]; + + pad_vertically = TRUE; + + if (print_a_header) + print_header (); + + if (padding_not_printed != ANYWHERE) + { + pad_across_to (padding_not_printed); + padding_not_printed = ANYWHERE; + } + + if (use_column_separator) + print_separators (); + + while (first != last) + print_char (*first++); + + if (spaces_not_printed == 0) + output_position = p->start_position + end_vector[line]; + + return TRUE; +} + +/* Convert a character to the proper format and return the number of + characters in the resulting clump. Increment input_position by + the width of the clump. + + Tabs are converted to clumps of spaces. + Nonprintable characters may be converted to clumps of escape + sequences or control prefixes. + + Note: the width of a clump is not necessarily equal to the number of + characters in clump_buff. (e.g, the width of '\b' is -1, while the + number of characters is 1.) 
*/ + +int +char_to_clump (c) + int c; +{ + register int *s = clump_buff; + register int i; + char esc_buff[4]; + int width; + int chars; + + if (c == input_tab_char) + { + width = tab_width (chars_per_input_tab, input_position); + + if (untabify_input) + { + for (i = width; i; --i) + *s++ = ' '; + chars = width; + } + else + { + *s = c; + chars = 1; + } + + } + else if (!ISPRINT (c)) + { + if (use_esc_sequence) + { + width = 4; + chars = 4; + *s++ = '\\'; + sprintf (esc_buff, "%03o", c); + for (i = 0; i <= 2; ++i) + *s++ = (int) esc_buff[i]; + } + else if (use_cntrl_prefix) + { + if (c < 0200) + { + width = 2; + chars = 2; + *s++ = '^'; + *s++ = c ^ 0100; + } + else + { + width = 4; + chars = 4; + *s++ = '\\'; + sprintf (esc_buff, "%03o", c); + for (i = 0; i <= 2; ++i) + *s++ = (int) esc_buff[i]; + } + } + else if (c == '\b') + { + width = -1; + chars = 1; + *s = c; + } + else + { + width = 0; + chars = 1; + *s = c; + } + } + else + { + width = 1; + chars = 1; + *s = c; + } + + input_position += width; + return chars; +} + +/* We've just printed some files and need to clean up things before + looking for more options and printing the next batch of files. + + Free everything we've xmalloc'ed, except `header'. */ + +void +cleanup () +{ + if (number_buff) + free (number_buff); + if (clump_buff) + free (clump_buff); + if (column_vector) + free (column_vector); + if (line_vector) + free (line_vector); + if (end_vector) + free (end_vector); + if (buff) + free (buff); +} + +/* Complain, print a usage message, and die. 
   If REASON is non-null it is printed first, prefixed with the
   program name.  The usage summary goes to stderr and the program
   exits with status 2. */

void
usage (reason)
     char *reason;
{
  if (reason)
    fprintf (stderr, "%s: %s\n", program_name, reason);

  fprintf (stderr, "\
Usage: %s [+PAGE] [-COLUMN] [-abcdfFmrtv] [-e[in-tab-char[in-tab-width]]]\n\
 [-h header] [-i[out-tab-char[out-tab-width]]] [-l page-length]\n\
 [-n[number-separator[digits]]] [-o left-margin]\n\
 [-s[column-separator]] [-w page-width] [file...]\n",
           program_name);
  exit (2);
}
diff --git a/src/sort.c b/src/sort.c
new file mode 100644
index 0000000..de8b937
--- /dev/null
+++ b/src/sort.c
@@ -0,0 +1,1746 @@
/* sort - sort lines of text (with all kinds of options).
   Copyright (C) 1988, 1991 Free Software Foundation

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Written December 1988 by Mike Haertel.
   The author may be reached (Email) at the address mike@ai.mit.edu,
   or (US mail) as Mike Haertel c/o Free Software Foundation. */

#define _GNU_SOURCE
/* NOTE(review): the header names of the bare #include lines below are
   missing in this copy (presumably lost together with their angle
   brackets).  Restore them from the upstream file before building. */
#include
#ifndef isblank
#define isblank(c) ((c) == ' ' || (c) == '\t')
#endif
#include
#include
#include
#include "system.h"
#ifdef _POSIX_VERSION
#include
#else
#ifndef UCHAR_MAX
#define UCHAR_MAX 255
#endif
#endif
#ifndef STDC_HEADERS
char *malloc ();
char *realloc ();
void free ();
#endif

void error ();
static void usage ();

/* The smaller of A and B.  Each argument may be evaluated twice. */
#define min(a, b) ((a) < (b) ? (a) : (b))
/* Number of distinct unsigned char values; size of the tables below. */
#define UCHAR_LIM (UCHAR_MAX + 1)
#define UCHAR(c) ((unsigned char) (c))

/* Character-class tests.  Where isascii exists, the character is
   checked with it before the <ctype.h> test is applied. */
#ifdef isascii
#define ISALNUM(c) (isascii(c) && isalnum(c))
#define ISDIGIT(c) (isascii(c) && isdigit(c))
#define ISPRINT(c) (isascii(c) && isprint(c))
#define ISLOWER(c) (isascii(c) && islower(c))
#else
#define ISALNUM(c) isalnum(c)
#define ISDIGIT(c) isdigit(c)
#define ISPRINT(c) isprint(c)
#define ISLOWER(c) islower(c)
#endif

/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };

/* The name this program was run with. */
char *program_name;

/* Table of digits. */
static int digits[UCHAR_LIM];

/* Table of white space. */
static int blanks[UCHAR_LIM];

/* Table of non-printing characters. */
static int nonprinting[UCHAR_LIM];

/* Table of non-dictionary characters (not letters, digits, or blanks). */
static int nondictionary[UCHAR_LIM];

/* Translation table folding lower case to upper. */
static char fold_toupper[UCHAR_LIM];

/* Table mapping 3-letter month names to integers.
   Alphabetic order allows binary search. */
static struct month
{
  char *name;
  int val;
} monthtab[] =
{
  "APR", 4,
  "AUG", 8,
  "DEC", 12,
  "FEB", 2,
  "JAN", 1,
  "JUL", 7,
  "JUN", 6,
  "MAR", 3,
  "MAY", 5,
  "NOV", 11,
  "OCT", 10,
  "SEP", 9
};

/* During the merge phase, the number of files to merge at once. */
#define NMERGE 16

/* Initial buffer size for in core sorting. Will not grow unless a
   line longer than this is seen. */
static int sortalloc = 524288;

/* Initial buffer size for in core merge buffers. Bear in mind that
   up to NMERGE * mergealloc bytes may be allocated for merge buffers. */
static int mergealloc = 16384;

/* Guess of average line length. */
static int linelength = 30;

/* Maximum number of elements for the array(s) of struct line's, in bytes. */
#define LINEALLOC 262144

/* Prefix for temporary file names.
*/ +static char *prefix; + +/* Flag to reverse the order of all comparisons. */ +static int reverse; + +/* Flag for stable sort. This turns off the last ditch bytewise + comparison of lines, and instead leaves lines in the same order + they were read if all keys compare equal. */ +static int stable; + +/* Tab character separating fields. If NUL, then fields are separated + by the empty string between a non-whitespace character and a whitespace + character. */ +static char tab; + +/* Flag to remove consecutive duplicate lines from the output. + Only the last of a sequence of equal lines will be output. */ +static int unique; + +/* Nonzero if any of the input files are the standard input. */ +static int have_read_stdin; + +/* Lines are held in core as counted strings. */ +struct line +{ + char *text; /* Text of the line. */ + int length; /* Length not including final newline. */ + char *keybeg; /* Start of first key. */ + char *keylim; /* Limit of first key. */ +}; + +/* Arrays of lines. */ +struct lines +{ + struct line *lines; /* Dynamically allocated array of lines. */ + int used; /* Number of slots used. */ + int alloc; /* Number of slots allocated. */ + int limit; /* Max number of slots to allocate. */ +}; + +/* Input buffers. */ +struct buffer +{ + char *buf; /* Dynamically allocated buffer. */ + int used; /* Number of bytes used. */ + int alloc; /* Number of bytes allocated. */ + int left; /* Number of bytes left after line parsing. */ +}; + +/* Lists of key field comparisons to be tried. */ +static struct keyfield +{ + int sword; /* Zero-origin 'word' to start at. */ + int schar; /* Additional characters to skip. */ + int skipsblanks; /* Skip leading white space at start. */ + int eword; /* Zero-origin first word after field. */ + int echar; /* Additional characters in field. */ + int skipeblanks; /* Skip trailing white space at finish. */ + int *ignore; /* Boolean array of characters to ignore. */ + char *translate; /* Translation applied to characters. 
*/ + int numeric; /* Flag for numeric comparison. */ + int month; /* Flag for comparison by month name. */ + int reverse; /* Reverse the sense of comparison. */ + struct keyfield *next; /* Next keyfield to try. */ +} keyhead; + +/* The list of temporary files. */ +static struct tempnode +{ + char *name; + struct tempnode *next; +} temphead; + +/* Clean up any remaining temporary files. */ + +static void +cleanup () +{ + struct tempnode *node; + + for (node = temphead.next; node; node = node->next) + unlink (node->name); +} + +/* Allocate N bytes of memory dynamically, with error checking. */ + +char * +xmalloc (n) + unsigned n; +{ + char *p; + + p = malloc (n); + if (p == 0) + { + error (0, 0, "virtual memory exhausted"); + cleanup (); + exit (2); + } + return p; +} + +/* Change the size of an allocated block of memory P to N bytes, + with error checking. + If P is NULL, run xmalloc. + If N is 0, run free and return NULL. */ + +char * +xrealloc (p, n) + char *p; + unsigned n; +{ + if (p == 0) + return xmalloc (n); + if (n == 0) + { + free (p); + return 0; + } + p = realloc (p, n); + if (p == 0) + { + error (0, 0, "virtual memory exhausted"); + cleanup (); + exit (2); + } + return p; +} + +static FILE * +xfopen (file, how) + char *file, *how; +{ + FILE *fp = strcmp (file, "-") ? fopen (file, how) : stdin; + + if (fp == 0) + { + error (0, errno, "%s", file); + cleanup (); + exit (2); + } + if (fp == stdin) + have_read_stdin = 1; + return fp; +} + +static void +xfclose (fp) + FILE *fp; +{ + fflush (fp); + if (fp != stdin && fp != stdout) + { + if (fclose (fp) != 0) + { + error (0, errno, "error closing file"); + cleanup (); + exit (2); + } + } + else + /* Allow reading stdin from tty more than once. 
*/
+    clearerr (fp);
+}
+
+static void
+xfwrite (buf, size, nelem, fp)
+     char *buf;
+     int size, nelem;
+     FILE *fp;
+{
+  if (fwrite (buf, size, nelem, fp) != nelem)
+    {
+      error (0, errno, "write error");
+      cleanup ();
+      exit (2);
+    }
+}
+
+/* Return a malloc'd name for a new temporary file under PREFIX, recorded on the temphead list. */
+
+static char *
+tempname ()
+{
+  static int seq;
+  int len = strlen (prefix);
+  /* "%5.5d" is only a minimum width, so leave room for two full-size ints. */
+  char *name = xmalloc (len + sizeof "/sort" + 2 * (3 * sizeof (int) + 1));
+  struct tempnode *node = (struct tempnode *) xmalloc (sizeof *node);
+
+  if (len && prefix[len - 1] != '/')
+    sprintf (name, "%s/sort%5.5d%5.5d", prefix, getpid (), ++seq);
+  else
+    sprintf (name, "%ssort%5.5d%5.5d", prefix, getpid (), ++seq);
+  node->name = name;
+  node->next = temphead.next;
+  temphead.next = node;
+  return name;
+}
+
+/* Search through the list of temporary files for NAME;
+   remove it if it is found on the list. */
+
+static void
+zaptemp (name)
+     char *name;
+{
+  struct tempnode *node, *temp;
+
+  for (node = &temphead; node->next; node = node->next)
+    if (!strcmp (name, node->next->name))
+      break;
+  if (node->next)
+    {
+      temp = node->next;
+      unlink (temp->name);
+      free (temp->name);
+      node->next = temp->next;
+      free ((char *) temp);
+    }
+}
+
+/* Initialize the character class tables. */
+
+static void
+inittables ()
+{
+  int i;
+
+  for (i = 0; i < UCHAR_LIM; ++i)
+    {
+      if (isblank (i))
+        blanks[i] = 1;
+      if (ISDIGIT (i))
+        digits[i] = 1;
+      if (!ISPRINT (i))
+        nonprinting[i] = 1;
+      if (!ISALNUM (i) && !isblank (i))
+        nondictionary[i] = 1;
+      if (ISLOWER (i))
+        fold_toupper[i] = toupper (i);
+      else
+        fold_toupper[i] = i;
+    }
+}
+
+/* Initialize BUF, allocating ALLOC bytes initially. */
+
+static void
+initbuf (buf, alloc)
+     struct buffer *buf;
+     int alloc;
+{
+  buf->alloc = alloc;
+  buf->buf = xmalloc (buf->alloc);
+  buf->used = buf->left = 0;
+}
+
+/* Fill BUF reading from FP, moving buf->left bytes from the end
+   of buf->buf to the beginning first. If EOF is reached and the
+   file wasn't terminated by a newline, supply one.
Return a count + of bytes buffered. */ + +static int +fillbuf (buf, fp) + struct buffer *buf; + FILE *fp; +{ + int cc; + + bcopy (buf->buf + buf->used - buf->left, buf->buf, buf->left); + buf->used = buf->left; + + while (!feof (fp) && (buf->used == 0 || !memchr (buf->buf, '\n', buf->used))) + { + if (buf->used == buf->alloc) + { + buf->alloc *= 2; + buf->buf = xrealloc (buf->buf, buf->alloc); + } + cc = fread (buf->buf + buf->used, 1, buf->alloc - buf->used, fp); + if (ferror (fp)) + { + error (0, errno, "read error"); + cleanup (); + exit (2); + } + buf->used += cc; + } + + if (feof (fp) && buf->used && buf->buf[buf->used - 1] != '\n') + { + if (buf->used == buf->alloc) + { + buf->alloc *= 2; + buf->buf = xrealloc (buf->buf, buf->alloc); + } + buf->buf[buf->used++] = '\n'; + } + + return buf->used; +} + +/* Initialize LINES, allocating space for ALLOC lines initially. + LIMIT is the maximum possible number of lines to allocate space + for, ever. */ + +static void +initlines (lines, alloc, limit) + struct lines *lines; + int alloc; + int limit; +{ + lines->alloc = alloc; + lines->lines = (struct line *) xmalloc (lines->alloc * sizeof (struct line)); + lines->used = 0; + lines->limit = limit; +} + +/* Return a pointer to the first character of the field specified + by KEY in LINE. 
*/ + +static char * +begfield (line, key) + struct line *line; + struct keyfield *key; +{ + register char *ptr = line->text, *lim = ptr + line->length; + register int sword = key->sword, schar = key->schar; + + if (tab) + while (ptr < lim && sword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim) + ++ptr; + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && blanks[UCHAR (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[UCHAR (*ptr)]) + ++ptr; + } + + if (key->skipsblanks) + while (ptr < lim && blanks[UCHAR (*ptr)]) + ++ptr; + + while (ptr < lim && schar--) + ++ptr; + + return ptr; +} + +/* Return the limit of (a pointer to the first character after) the field + in LINE specified by KEY. */ + +static char * +limfield (line, key) + struct line *line; + struct keyfield *key; +{ + register char *ptr = line->text, *lim = ptr + line->length; + register int eword = key->eword, echar = key->echar; + + if (tab) + while (ptr < lim && eword--) + { + while (ptr < lim && *ptr != tab) + ++ptr; + if (ptr < lim && (eword || key->skipeblanks)) + ++ptr; + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && blanks[UCHAR (*ptr)]) + ++ptr; + while (ptr < lim && !blanks[UCHAR (*ptr)]) + ++ptr; + } + + if (key->skipeblanks) + while (ptr < lim && blanks[UCHAR (*ptr)]) + ++ptr; + + while (ptr < lim && echar--) + ++ptr; + + return ptr; +} + +/* Find the lines in BUF, storing pointers and lengths in LINES. + Also replace newlines with NULs. */ + +static void +findlines (buf, lines) + struct buffer *buf; + struct lines *lines; +{ + register char *beg = buf->buf, *lim = buf->buf + buf->used, *ptr; + struct keyfield *key = keyhead.next; + + lines->used = 0; + + while (beg < lim && (ptr = memchr (beg, '\n', lim - beg)) + && lines->used < lines->limit) + { + /* There are various places in the code that rely on a NUL + being at the end of in-core lines; NULs inside the lines + will not cause trouble, though. 
*/ + *ptr = '\0'; + + if (lines->used == lines->alloc) + { + lines->alloc *= 2; + lines->lines = (struct line *) + xrealloc ((char *) lines->lines, + lines->alloc * sizeof (struct line)); + } + + lines->lines[lines->used].text = beg; + lines->lines[lines->used].length = ptr - beg; + + /* Precompute the position of the first key for efficiency. */ + if (key) + { + if (key->eword >= 0) + lines->lines[lines->used].keylim = + limfield (&lines->lines[lines->used], key); + else + lines->lines[lines->used].keylim = ptr; + + if (key->sword >= 0) + lines->lines[lines->used].keybeg = + begfield (&lines->lines[lines->used], key); + else + { + if (key->skipsblanks) + while (blanks[UCHAR (*beg)]) + ++beg; + lines->lines[lines->used].keybeg = beg; + } + } + + ++lines->used; + beg = ptr + 1; + } + + buf->left = lim - beg; +} + +/* Compare strings A and B containing decimal fractions < 1. Each string + should begin with a decimal point followed immediately by the digits + of the fraction. Strings not of this form are considered to be zero. */ + +static int +fraccompare (a, b) + register char *a, *b; +{ + register tmpa = UCHAR (*a), tmpb = UCHAR (*b); + + if (tmpa == '.' && tmpb == '.') + { + do + tmpa = UCHAR (*++a), tmpb = UCHAR (*++b); + while (tmpa == tmpb && digits[tmpa]); + if (digits[tmpa] && digits[tmpb]) + return tmpa - tmpb; + if (digits[tmpa]) + { + while (tmpa == '0') + tmpa = UCHAR (*++a); + if (digits[tmpa]) + return 1; + return 0; + } + if (digits[tmpb]) + { + while (tmpb == '0') + tmpb = UCHAR (*++b); + if (digits[tmpb]) + return -1; + return 0; + } + return 0; + } + else if (tmpa == '.') + { + do + tmpa = UCHAR (*++a); + while (tmpa == '0'); + if (digits[tmpa]) + return 1; + return 0; + } + else if (tmpb == '.') + { + do + tmpb = UCHAR (*++b); + while (tmpb == '0'); + if (digits[tmpb]) + return -1; + return 0; + } + return 0; +} + +/* Compare strings A and B as numbers without explicitly converting them to + machine numbers. 
Comparatively slow for short strings, but asymptotically + hideously fast. */ + +static int +numcompare (a, b) + register char *a, *b; +{ + register int tmpa, tmpb, loga, logb, tmp; + + tmpa = UCHAR (*a), tmpb = UCHAR (*b); + + if (tmpa == '-') + { + tmpa = UCHAR (*++a); + if (tmpb != '-') + { + if (digits[tmpa] && digits[tmpb]) + return -1; + return 0; + } + tmpb = UCHAR (*++b); + + while (tmpa == '0') + tmpa = UCHAR (*++a); + while (tmpb == '0') + tmpb = UCHAR (*++b); + + while (tmpa == tmpb && digits[tmpa]) + tmpa = UCHAR (*++a), tmpb = UCHAR (*++b); + + if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa])) + return -fraccompare (a, b); + + if (digits[tmpa]) + for (loga = 1; digits[UCHAR (*++a)]; ++loga) + ; + else + loga = 0; + + if (digits[tmpb]) + for (logb = 1; digits[UCHAR (*++b)]; ++logb) + ; + else + logb = 0; + + if (tmp = logb - loga) + return tmp; + + if (!loga) + return 0; + + return tmpb - tmpa; + } + else if (tmpb == '-') + { + if (digits[UCHAR (tmpa)] && digits[UCHAR (*++b)]) + return 1; + return 0; + } + else + { + while (tmpa == '0') + tmpa = UCHAR (*++a); + while (tmpb == '0') + tmpb = UCHAR (*++b); + + while (tmpa == tmpb && digits[tmpa]) + tmpa = UCHAR (*++a), tmpb = UCHAR (*++b); + + if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa])) + return fraccompare (a, b); + + if (digits[tmpa]) + for (loga = 1; digits[UCHAR (*++a)]; ++loga) + ; + else + loga = 0; + + if (digits[tmpb]) + for (logb = 1; digits[UCHAR (*++b)]; ++logb) + ; + else + logb = 0; + + if (tmp = loga - logb) + return tmp; + + if (!loga) + return 0; + + return tmpa - tmpb; + } +} + +/* Return an integer <= 12 associated with month name S with length LEN, + 0 if the name in S is not recognized. 
*/ + +static int +getmonth (s, len) + char *s; + int len; +{ + char month[4]; + register int i, lo = 0, hi = 12; + + if (len < 3) + return 0; + + for (i = 0; i < 3; ++i) + month[i] = fold_toupper[UCHAR (s[i])]; + month[3] = '\0'; + + while (hi - lo > 1) + if (strcmp (month, monthtab[(lo + hi) / 2].name) < 0) + hi = (lo + hi) / 2; + else + lo = (lo + hi) / 2; + if (!strcmp (month, monthtab[lo].name)) + return monthtab[lo].val; + return 0; +} + +/* Compare two lines A and B trying every key in sequence until there + are no more keys or a difference is found. */ + +static int +keycompare (a, b) + struct line *a, *b; +{ + register char *texta, *textb, *lima, *limb, *translate; + register int *ignore; + struct keyfield *key; + int diff = 0, iter = 0, lena, lenb; + + for (key = keyhead.next; key; key = key->next, ++iter) + { + ignore = key->ignore; + translate = key->translate; + + /* Find the beginning and limit of each field. */ + if (iter || a->keybeg == NULL || b->keybeg == NULL) + { + if (key->eword >= 0) + lima = limfield (a, key), limb = limfield (b, key); + else + lima = a->text + a->length, limb = b->text + b->length; + + if (key->sword >= 0) + texta = begfield (a, key), textb = begfield (b, key); + else + { + texta = a->text, textb = b->text; + if (key->skipsblanks) + { + while (texta < lima && blanks[UCHAR (*texta)]) + ++texta; + while (textb < limb && blanks[UCHAR (*textb)]) + ++textb; + } + } + } + else + { + /* For the first iteration only, the key positions have + been precomputed for us. */ + texta = a->keybeg, lima = a->keylim; + textb = b->keybeg, limb = b->keylim; + } + + /* Find the lengths. */ + lena = lima - texta, lenb = limb - textb; + if (lena < 0) + lena = 0; + if (lenb < 0) + lenb = 0; + + /* Actually compare the fields. 
*/ + if (key->numeric) + { + if (*lima || *limb) + { + char savea = *lima, saveb = *limb; + + *lima = *limb = '\0'; + diff = numcompare (texta, textb); + *lima = savea, *limb = saveb; + } + else + diff = numcompare (texta, textb); + + if (diff) + return key->reverse ? -diff : diff; + continue; + } + else if (key->month) + { + diff = getmonth (texta, lena) - getmonth (textb, lenb); + if (diff) + return key->reverse ? -diff : diff; + continue; + } + else if (ignore && translate) + while (texta < lima && textb < limb) + { + while (texta < lima && ignore[UCHAR (*texta)]) + ++texta; + while (textb < limb && ignore[UCHAR (*textb)]) + ++textb; + if (texta < lima && textb < limb && + translate[UCHAR (*texta++)] != translate[UCHAR (*textb++)]) + { + diff = translate[UCHAR (*--texta)] - translate[UCHAR (*--textb)]; + break; + } + } + else if (ignore) + while (texta < lima && textb < limb) + { + while (texta < lima && ignore[UCHAR (*texta)]) + ++texta; + while (textb < limb && ignore[UCHAR (*textb)]) + ++textb; + if (texta < lima && textb < limb && *texta++ != *textb++) + { + diff = *--texta - *--textb; + break; + } + } + else if (translate) + while (texta < lima && textb < limb) + { + if (translate[UCHAR (*texta++)] != translate[UCHAR (*textb++)]) + { + diff = translate[UCHAR (*--texta)] - translate[UCHAR (*--textb)]; + break; + } + } + else + diff = memcmp (texta, textb, min (lena, lenb)); + + if (diff) + return key->reverse ? -diff : diff; + if (diff = lena - lenb) + return key->reverse ? -diff : diff; + } + + return 0; +} + +/* Compare two lines A and B, returning negative, zero, or positive + depending on whether A compares less than, equal to, or greater than B. 
*/
+
+static int
+compare (a, b)
+     register struct line *a, *b;
+{
+  int diff, tmpa, tmpb, mini;
+
+  if (keyhead.next)
+    {
+      diff = keycompare (a, b);
+      if (diff)
+        return diff;
+      if (!unique && !stable)
+        {
+          tmpa = a->length, tmpb = b->length;
+          diff = memcmp (a->text, b->text, min (tmpa, tmpb));
+          if (!diff)
+            diff = tmpa - tmpb;
+        }
+    }
+  else
+    {
+      tmpa = a->length, tmpb = b->length;
+      mini = min (tmpa, tmpb);
+      if (mini == 0)
+        diff = tmpa - tmpb;
+      else
+        {
+          char *ap = a->text, *bp = b->text;
+
+          diff = UCHAR (*ap) - UCHAR (*bp);	/* Unsigned bytes, to agree with memcmp below. */
+          if (diff == 0)
+            {
+              diff = memcmp (ap, bp, mini);
+              if (diff == 0)
+                diff = tmpa - tmpb;
+            }
+        }
+    }
+
+  return reverse ? -diff : diff;
+}
+
+/* Check that the lines read from the given FP come in order. Return
+   1 if they do and 0 if there is a disorder. */
+
+static int
+checkfp (fp)
+     FILE *fp;
+{
+  struct buffer buf;		/* Input buffer. */
+  struct lines lines;		/* Lines scanned from the buffer. */
+  struct line temp;		/* Copy of previous line. */
+  int cc;			/* Character count. */
+  int cmp;			/* Result of calling compare. */
+  int alloc, i, success = 1;
+
+  initbuf (&buf, mergealloc);
+  initlines (&lines, mergealloc / linelength + 1,
+             LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line)));
+  alloc = linelength;
+  temp.text = xmalloc (alloc);
+
+  cc = fillbuf (&buf, fp);
+  findlines (&buf, &lines);
+
+  if (cc)
+    do
+      {
+        /* Compare each line in the buffer with its successor. */
+        for (i = 0; i < lines.used - 1; ++i)
+          {
+            cmp = compare (&lines.lines[i], &lines.lines[i + 1]);
+            if ((unique && cmp >= 0) || (cmp > 0))
+              {
+                success = 0;
+                goto finish;
+              }
+          }
+
+        /* Save the last line of the buffer and refill the buffer.
*/
+        if (lines.lines[lines.used - 1].length + 1 > alloc)	/* + 1: the NUL is copied too. */
+          {
+            while (lines.lines[lines.used - 1].length + 1 > alloc)
+              alloc *= 2;
+            temp.text = xrealloc (temp.text, alloc);
+          }
+        temp.length = lines.lines[lines.used - 1].length;
+        temp.keybeg = temp.keylim = NULL;	/* Make keycompare recompute the key bounds. */
+        bcopy (lines.lines[lines.used - 1].text, temp.text, temp.length + 1);
+
+        cc = fillbuf (&buf, fp);
+        if (cc)
+          {
+            findlines (&buf, &lines);
+            /* Make sure the line saved from the old buffer contents is
+               less than or equal to the first line of the new buffer. */
+            cmp = compare (&temp, &lines.lines[0]);
+            if ((unique && cmp >= 0) || (cmp > 0))
+              {
+                success = 0;
+                break;
+              }
+          }
+      }
+    while (cc);
+
+finish:
+  xfclose (fp);
+  free (buf.buf);
+  free ((char *) lines.lines);
+  free (temp.text);
+  return success;
+}
+
+/* Merge lines from FPS onto OFP. NFPS cannot be greater than NMERGE.
+   Close FPS before returning. */
+
+static void
+mergefps (fps, nfps, ofp)
+     FILE *fps[], *ofp;
+     register int nfps;
+{
+  struct buffer buffer[NMERGE];	/* Input buffers for each file. */
+  struct lines lines[NMERGE];	/* Line tables for each buffer. */
+  struct line saved;		/* Saved line for unique check. */
+  int savedflag = 0;		/* True if there is a saved line. */
+  int savealloc;		/* Size allocated for the saved line. */
+  int cur[NMERGE];		/* Current line in each line table. */
+  int ord[NMERGE];		/* Table representing a permutation of fps,
+				   such that lines[ord[0]].lines[cur[ord[0]]]
+				   is the smallest line and will be next
+				   output. */
+  register int i, j, t;
+
+  /* Allocate space for a saved line if necessary. */
+  if (unique)
+    {
+      savealloc = linelength;
+      saved.text = xmalloc (savealloc);
+    }
+
+  /* Read initial lines from each input file. */
+  for (i = 0; i < nfps; ++i)
+    {
+      initbuf (&buffer[i], mergealloc);
+      /* If a file is empty, eliminate it from future consideration.
*/ + while (i < nfps && !fillbuf (&buffer[i], fps[i])) + { + xfclose (fps[i]); + --nfps; + for (j = i; j < nfps; ++j) + fps[j] = fps[j + 1]; + } + if (i == nfps) + free (buffer[i].buf); + else + { + initlines (&lines[i], mergealloc / linelength + 1, + LINEALLOC / ((NMERGE + NMERGE) * sizeof (struct line))); + findlines (&buffer[i], &lines[i]); + cur[i] = 0; + } + } + + /* Set up the ord table according to comparisons among input lines. + Since this only reorders two items if one is strictly greater than + the other, it is stable. */ + for (i = 0; i < nfps; ++i) + ord[i] = i; + for (i = 1; i < nfps; ++i) + if (compare (&lines[ord[i - 1]].lines[cur[ord[i - 1]]], + &lines[ord[i]].lines[cur[ord[i]]]) > 0) + t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0; + + /* Repeatedly output the smallest line until no input remains. */ + while (nfps) + { + /* If uniqified output is turned out, output only the first of + an identical series of lines. */ + if (unique) + { + if (savedflag && compare (&saved, &lines[ord[0]].lines[cur[ord[0]]])) + { + xfwrite (saved.text, 1, saved.length, ofp); + putc ('\n', ofp); + savedflag = 0; + } + if (!savedflag) + { + if (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1) + { + while (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1) + savealloc *= 2; + saved.text = xrealloc (saved.text, savealloc); + } + saved.length = lines[ord[0]].lines[cur[ord[0]]].length; + bcopy (lines[ord[0]].lines[cur[ord[0]]].text, saved.text, + saved.length + 1); + savedflag = 1; + } + } + else + { + xfwrite (lines[ord[0]].lines[cur[ord[0]]].text, 1, + lines[ord[0]].lines[cur[ord[0]]].length, ofp); + putc ('\n', ofp); + } + + /* Check if we need to read more lines into core. */ + if (++cur[ord[0]] == lines[ord[0]].used) + if (fillbuf (&buffer[ord[0]], fps[ord[0]])) + { + findlines (&buffer[ord[0]], &lines[ord[0]]); + cur[ord[0]] = 0; + } + else + { + /* We reached EOF on fps[ord[0]]. 
*/ + for (i = 1; i < nfps; ++i) + if (ord[i] > ord[0]) + --ord[i]; + --nfps; + xfclose (fps[ord[0]]); + free (buffer[ord[0]].buf); + free ((char *) lines[ord[0]].lines); + for (i = ord[0]; i < nfps; ++i) + { + fps[i] = fps[i + 1]; + buffer[i] = buffer[i + 1]; + lines[i] = lines[i + 1]; + cur[i] = cur[i + 1]; + } + for (i = 0; i < nfps; ++i) + ord[i] = ord[i + 1]; + continue; + } + + /* The new line just read in may be larger than other lines + already in core; push it back in the queue until we encounter + a line larger than it. */ + for (i = 1; i < nfps; ++i) + { + t = compare (&lines[ord[0]].lines[cur[ord[0]]], + &lines[ord[i]].lines[cur[ord[i]]]); + if (!t) + t = ord[0] - ord[i]; + if (t < 0) + break; + } + t = ord[0]; + for (j = 1; j < i; ++j) + ord[j - 1] = ord[j]; + ord[i - 1] = t; + } + + if (unique && savedflag) + { + xfwrite (saved.text, 1, saved.length, ofp); + putc ('\n', ofp); + free (saved.text); + } +} + +/* Sort the array LINES with NLINES members, using TEMP for temporary space. */ + +static void +sortlines (lines, nlines, temp) + struct line *lines, *temp; + int nlines; +{ + register struct line *lo, *hi, *t; + register int nlo, nhi; + + if (nlines == 2) + { + if (compare (&lines[0], &lines[1]) > 0) + *temp = lines[0], lines[0] = lines[1], lines[1] = *temp; + return; + } + + nlo = nlines / 2; + lo = lines; + nhi = nlines - nlo; + hi = lines + nlo; + + if (nlo > 1) + sortlines (lo, nlo, temp); + + if (nhi > 1) + sortlines (hi, nhi, temp); + + t = temp; + + while (nlo && nhi) + if (compare (lo, hi) <= 0) + *t++ = *lo++, --nlo; + else + *t++ = *hi++, --nhi; + while (nlo--) + *t++ = *lo++; + + for (lo = lines, nlo = nlines - nhi, t = temp; nlo; --nlo) + *lo++ = *t++; +} + +/* Check that each of the NFILES FILES is ordered. + Return a count of disordered files. 
*/ + +static int +check (files, nfiles) + char *files[]; + int nfiles; +{ + int i, disorders = 0; + FILE *fp; + + for (i = 0; i < nfiles; ++i) + { + fp = xfopen (files[i], "r"); + if (!checkfp (fp)) + { + printf ("%s: disorder on %s\n", program_name, files[i]); + ++disorders; + } + } + return disorders; +} + +/* Merge NFILES FILES onto OFP. */ + +static void +merge (files, nfiles, ofp) + char *files[]; + int nfiles; + FILE *ofp; +{ + int i, j, t; + char *temp; + FILE *fps[NMERGE], *tfp; + + while (nfiles > NMERGE) + { + t = 0; + for (i = 0; i < nfiles / NMERGE; ++i) + { + for (j = 0; j < NMERGE; ++j) + fps[j] = xfopen (files[i * NMERGE + j], "r"); + tfp = xfopen (temp = tempname (), "w"); + mergefps (fps, NMERGE, tfp); + xfclose (tfp); + for (j = 0; j < NMERGE; ++j) + zaptemp (files[i * NMERGE + j]); + files[t++] = temp; + } + for (j = 0; j < nfiles % NMERGE; ++j) + fps[j] = xfopen (files[i * NMERGE + j], "r"); + tfp = xfopen (temp = tempname (), "w"); + mergefps (fps, nfiles % NMERGE, tfp); + xfclose (tfp); + for (j = 0; j < nfiles % NMERGE; ++j) + zaptemp (files[i * NMERGE + j]); + files[t++] = temp; + nfiles = t; + } + + for (i = 0; i < nfiles; ++i) + fps[i] = xfopen (files[i], "r"); + mergefps (fps, i, ofp); + for (i = 0; i < nfiles; ++i) + zaptemp (files[i]); +} + +/* Sort NFILES FILES onto OFP. 
*/ + +static void +sort (files, nfiles, ofp) + char **files; + int nfiles; + FILE *ofp; +{ + struct buffer buf; + struct lines lines; + struct line *tmp; + int i, ntmp; + FILE *fp, *tfp; + struct tempnode *node; + int ntemp = 0; + char **tempfiles; + + initbuf (&buf, sortalloc); + initlines (&lines, sortalloc / linelength + 1, + LINEALLOC / sizeof (struct line)); + ntmp = lines.alloc; + tmp = (struct line *) xmalloc (ntmp * sizeof (struct line)); + + while (nfiles--) + { + fp = xfopen (*files++, "r"); + while (fillbuf (&buf, fp)) + { + findlines (&buf, &lines); + if (lines.used > ntmp) + { + while (lines.used > ntmp) + ntmp *= 2; + tmp = (struct line *) + xrealloc ((char *) tmp, ntmp * sizeof (struct line)); + } + sortlines (lines.lines, lines.used, tmp); + if (feof (fp) && !nfiles && !ntemp && !buf.left) + tfp = ofp; + else + { + ++ntemp; + tfp = xfopen (tempname (), "w"); + } + for (i = 0; i < lines.used; ++i) + if (!unique || i == 0 + || compare (&lines.lines[i], &lines.lines[i - 1])) + { + xfwrite (lines.lines[i].text, 1, lines.lines[i].length, tfp); + putc ('\n', tfp); + } + if (tfp != ofp) + xfclose (tfp); + } + xfclose (fp); + } + + free (buf.buf); + free ((char *) lines.lines); + free ((char *) tmp); + + if (ntemp) + { + tempfiles = (char **) xmalloc (ntemp * sizeof (char *)); + i = ntemp; + for (node = temphead.next; node; node = node->next) + tempfiles[--i] = node->name; + merge (tempfiles, ntemp, ofp); + free ((char *) tempfiles); + } +} + +/* Insert key KEY at the end of the list (`keyhead'). */ + +static void +insertkey (key) + struct keyfield *key; +{ + struct keyfield *k = &keyhead; + + while (k->next) + k = k->next; + k->next = key; + key->next = NULL; +} + +static void +badfieldspec (s) + char *s; +{ + error (2, 0, "invalid field specification `%s'", s); +} + +/* Handle interrupts and hangups. 
*/ + +static void +sighandler (sig) + int sig; +{ +#ifdef _POSIX_VERSION + struct sigaction sigact; + + sigact.sa_handler = SIG_DFL; + sigemptyset (&sigact.sa_mask); + sigact.sa_flags = 0; + sigaction (sig, &sigact, NULL); +#else /* !_POSIX_VERSION */ + signal (sig, SIG_DFL); +#endif /* _POSIX_VERSION */ + cleanup (); + kill (getpid (), sig); +} + +/* Set the ordering options for KEY specified in S. + Return the address of the first character in S that + is not a valid ordering option. + BLANKTYPE is the kind of blanks that 'b' should skip. */ + +static char * +set_ordering (s, key, blanktype) + register char *s; + struct keyfield *key; + enum blanktype blanktype; +{ + while (*s) + { + switch (*s) + { + case 'b': + if (blanktype == bl_start || blanktype == bl_both) + key->skipsblanks = 1; + if (blanktype == bl_end || blanktype == bl_both) + key->skipeblanks = 1; + break; + case 'd': + key->ignore = nondictionary; + break; + case 'f': + key->translate = fold_toupper; + break; +#if 0 + case 'g': + /* Reserved for comparing floating-point numbers. 
*/ + break; +#endif + case 'i': + key->ignore = nonprinting; + break; + case 'M': + key->skipsblanks = key->skipeblanks = key->month = 1; + break; + case 'n': + key->skipsblanks = key->skipeblanks = key->numeric = 1; + break; + case 'r': + key->reverse = 1; + break; + default: + return s; + } + ++s; + } + return s; +} + +void +main (argc, argv) + int argc; + char *argv[]; +{ + struct keyfield *key = NULL, gkey; + char *s; + int i, t, t2; + int checkonly = 0, mergeonly = 0, nfiles = 0; + char *minus = "-", *outfile = minus, **files, *tmp; + FILE *ofp; +#ifdef _POSIX_VERSION + struct sigaction oldact, newact; +#endif /* _POSIX_VERSION */ + + program_name = argv[0]; + have_read_stdin = 0; + inittables (); + + prefix = getenv ("TMPDIR"); + if (prefix == NULL) + prefix = "/tmp"; + +#ifdef _POSIX_VERSION + newact.sa_handler = sighandler; + sigemptyset (&newact.sa_mask); + newact.sa_flags = 0; + + sigaction (SIGINT, NULL, &oldact); + if (oldact.sa_handler != SIG_IGN) + sigaction (SIGINT, &newact, NULL); + sigaction (SIGHUP, NULL, &oldact); + if (oldact.sa_handler != SIG_IGN) + sigaction (SIGHUP, &newact, NULL); + sigaction (SIGPIPE, NULL, &oldact); + if (oldact.sa_handler != SIG_IGN) + sigaction (SIGPIPE, &newact, NULL); + sigaction (SIGTERM, NULL, &oldact); + if (oldact.sa_handler != SIG_IGN) + sigaction (SIGTERM, &newact, NULL); +#else /* !_POSIX_VERSION */ + if (signal (SIGINT, SIG_IGN) != SIG_IGN) + signal (SIGINT, sighandler); + if (signal (SIGHUP, SIG_IGN) != SIG_IGN) + signal (SIGHUP, sighandler); + if (signal (SIGPIPE, SIG_IGN) != SIG_IGN) + signal (SIGPIPE, sighandler); + if (signal (SIGTERM, SIG_IGN) != SIG_IGN) + signal (SIGTERM, sighandler); +#endif /* !_POSIX_VERSION */ + + gkey.sword = gkey.eword = -1; + gkey.ignore = NULL; + gkey.translate = NULL; + gkey.numeric = gkey.month = gkey.reverse = 0; + gkey.skipsblanks = gkey.skipeblanks = 0; + + files = (char **) xmalloc (sizeof (char *) * argc); + + for (i = 1; i < argc; ++i) + { + if (argv[i][0] == '+') + { + 
if (key) + insertkey (key); + key = (struct keyfield *) xmalloc (sizeof (struct keyfield)); + key->eword = -1; + key->ignore = NULL; + key->translate = NULL; + key->skipsblanks = key->skipeblanks = 0; + key->numeric = key->month = key->reverse = 0; + s = argv[i] + 1; + if (!digits[UCHAR (*s)]) + badfieldspec (argv[i]); + for (t = 0; digits[UCHAR (*s)]; ++s) + t = 10 * t + *s - '0'; + t2 = 0; + if (*s == '.') + for (++s; digits[UCHAR (*s)]; ++s) + t2 = 10 * t2 + *s - '0'; + if (t2 || t) + { + key->sword = t; + key->schar = t2; + } + else + key->sword = -1; + s = set_ordering (s, key, bl_start); + if (*s) + badfieldspec (argv[i]); + } + else if (argv[i][0] == '-' && argv[i][1]) + { + s = argv[i] + 1; + if (digits[UCHAR (*s)]) + { + if (!key) + usage (); + for (t = 0; digits[UCHAR (*s)]; ++s) + t = t * 10 + *s - '0'; + t2 = 0; + if (*s == '.') + for (++s; digits[UCHAR (*s)]; ++s) + t2 = t2 * 10 + *s - '0'; + key->eword = t; + key->echar = t2; + s = set_ordering (s, key, bl_end); + if (*s) + badfieldspec (argv[i]); + insertkey (key); + key = NULL; + } + else + while (*s) + { + s = set_ordering (s, &gkey, bl_both); + switch (*s) + { + case '\0': + break; + case 'c': + checkonly = 1; + break; + case 'k': + if (s[1]) + ++s; + else + { + if (i == argc - 1) + error (2, 0, "option `-k' requires an argument"); + else + s = argv[++i]; + } + if (key) + insertkey (key); + key = (struct keyfield *) + xmalloc (sizeof (struct keyfield)); + key->eword = -1; + key->ignore = NULL; + key->translate = NULL; + key->skipsblanks = key->skipeblanks = 0; + key->numeric = key->month = key->reverse = 0; + /* Get POS1. 
*/ + if (!digits[UCHAR (*s)]) + badfieldspec (argv[i]); + for (t = 0; digits[UCHAR (*s)]; ++s) + t = 10 * t + *s - '0'; + if (t) + t--; + t2 = 0; + if (*s == '.') + { + for (++s; digits[UCHAR (*s)]; ++s) + t2 = 10 * t2 + *s - '0'; + if (t2) + t2--; + } + if (t2 || t) + { + key->sword = t; + key->schar = t2; + } + else + key->sword = -1; + s = set_ordering (s, key, bl_start); + if (*s && *s != ',') + badfieldspec (argv[i]); + else if (*s++) + { + /* Get POS2. */ + for (t = 0; digits[UCHAR (*s)]; ++s) + t = t * 10 + *s - '0'; + t2 = 0; + if (*s == '.') + { + for (++s; digits[UCHAR (*s)]; ++s) + t2 = t2 * 10 + *s - '0'; + if (t2) + t--; + } + key->eword = t; + key->echar = t2; + s = set_ordering (s, key, bl_end); + if (*s) + badfieldspec (argv[i]); + } + insertkey (key); + key = NULL; + goto outer; + case 'm': + mergeonly = 1; + break; + case 'o': + if (s[1]) + outfile = s + 1; + else + { + if (i == argc - 1) + error (2, 0, "option `-o' requires an argument"); + else + outfile = argv[++i]; + } + goto outer; + case 's': + stable = 1; + break; + case 't': + if (s[1]) + tab = *++s; + else if (i < argc - 1) + { + tab = *argv[++i]; + goto outer; + } + else + error (2, 0, "option `-t' requires an argument"); + break; + case 'u': + unique = 1; + break; + default: + fprintf (stderr, "%s: unrecognized option `-%c'\n", + argv[0], *s); + usage (); + } + if (*s) + ++s; + } + } + else /* Not an option. */ + { + files[nfiles++] = argv[i]; + } + outer:; + } + + if (key) + insertkey (key); + + /* Inheritance of global options to individual keys. 
*/ + for (key = keyhead.next; key; key = key->next) + if (!key->ignore && !key->translate && !key->skipsblanks && !key->reverse + && !key->skipeblanks && !key->month && !key->numeric) + { + key->ignore = gkey.ignore; + key->translate = gkey.translate; + key->skipsblanks = gkey.skipsblanks; + key->skipeblanks = gkey.skipeblanks; + key->month = gkey.month; + key->numeric = gkey.numeric; + key->reverse = gkey.reverse; + } + + if (!keyhead.next && (gkey.ignore || gkey.translate || gkey.skipsblanks + || gkey.reverse || gkey.skipeblanks + || gkey.month || gkey.numeric)) + insertkey (&gkey); + + if (nfiles == 0) + { + nfiles = 1; + files = − + } + + if (checkonly) + exit (check (files, nfiles) != 0); + + if (strcmp (outfile, "-")) + { + for (i = 0; i < nfiles; ++i) + if (!strcmp (outfile, files[i])) + break; + if (i == nfiles) + ofp = xfopen (outfile, "w"); + else + { + char buf[8192]; + FILE *fp = xfopen (outfile, "r"); + int cc; + + tmp = tempname (); + ofp = xfopen (tmp, "w"); + while ((cc = fread (buf, 1, sizeof buf, fp)) > 0) + xfwrite (buf, 1, cc, ofp); + if (ferror (fp)) + { + error (0, errno, "%s", outfile); + cleanup (); + exit (2); + } + xfclose (ofp); + xfclose (fp); + files[i] = tmp; + ofp = xfopen (outfile, "w"); + } + } + else + ofp = stdout; + + if (mergeonly) + merge (files, nfiles, ofp); + else + sort (files, nfiles, ofp); + cleanup (); + + if (have_read_stdin && fclose (stdin) == EOF) + error (1, errno, "-"); + if (ferror (stdout) || fclose (stdout) == EOF) + error (1, 0, "write error"); + + exit (0); +} + +static void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-cmus] [-t separator] [-o output-file] [-bdfiMnr] [+POS1 [-POS2]]\n\ + [-k POS1[,POS2]] [file...]\n", + program_name); + exit (2); +} diff --git a/src/split.c b/src/split.c new file mode 100644 index 0000000..ccc4535 --- /dev/null +++ b/src/split.c @@ -0,0 +1,532 @@ +/* split.c -- split a file into pieces. + Copyright (C) 1988, 1991 Free Software Foundation, Inc. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* By tege@sics.se, with rms. + + To do: + * Implement -t CHAR or -t REGEX to specify break characters other + than newline. */ + +#include +#include +#include +#include +#include "system.h" + +char *xmalloc (); +void error (); + +int convint (); +int isdigits (); +int stdread (); +void line_bytes_split (); +void bytes_split (); +void cwrite (); +void lines_split (); +void next_file_name (); + +/* Name under which this program was invoked. */ +char *program_name; + +/* Base name of output files. */ +char *outfile; + +/* Pointer to the end of the prefix in OUTFILE. + Suffixes are inserted here. */ +char *outfile_mid; + +/* Pointer to the end of OUTFILE. */ +char *outfile_end; + +/* Status for outfile name generation. */ +unsigned outfile_count = -1; +unsigned outfile_name_limit = 25 * 26; +unsigned outfile_name_generation = 1; + +/* Name of input file. May be "-". */ +char *infile; + +/* Descriptor on which input file is open. */ +int input_desc; + +/* Descriptor on which output file is open. 
*/
+int output_desc;
+
+/* Print REASON, if non-NULL, followed by a usage summary on standard
+   error, then exit with status 2.  */
+
+void
+usage (reason)
+     char *reason;
+{
+  if (reason != NULL)
+    fprintf (stderr, "%s: %s\n", program_name, reason);
+  fprintf (stderr, "\
+Usage: %s [-lines] [-l lines] [-b bytes[bkm]] [-C bytes[bkm]]\n\
+ [--lines=lines] [--bytes=bytes[bkm]] [--line-bytes=bytes[bkm]]\n\
+ [infile [outfile-prefix]]\n",
+           program_name);
+  exit (2);
+}
+
+/* Long options; each maps onto the short option letter that `main'
+   handles in its switch.  */
+struct option longopts[] =
+{
+  {"bytes", 1, NULL, 'b'},
+  {"lines", 1, NULL, 'l'},
+  {"line-bytes", 1, NULL, 'C'},
+  {NULL, 0, NULL, 0}
+};
+
+/* Parse the command line, open the input file, set up the output file
+   name buffer, and dispatch to the splitting routine selected by the
+   options.  Exits with status 0 on success.  */
+
+void
+main (argc, argv)
+     int argc;
+     char *argv[];
+{
+  struct stat stat_buf;
+  int num;                      /* numeric argument from command line */
+  enum
+    {
+      type_undef, type_bytes, type_byteslines, type_lines, type_digits
+    } split_type = type_undef;
+  int in_blk_size;              /* optimal block size of input file device */
+  char *buf;                    /* file i/o buffer */
+  int accum = 0;
+  char *outbase;
+  int c;
+  int digits_optind = 0;
+
+  program_name = argv[0];
+
+  /* Parse command line options.  */
+
+  infile = "-";
+  outbase = "x";
+
+  while (1)
+    {
+      /* This is the argv-index of the option we will read next.  */
+      int this_optind = optind ? optind : 1;
+
+      c = getopt_long (argc, argv, "0123456789b:l:C:", longopts, (int *) 0);
+      if (c == EOF)
+        break;
+
+      switch (c)
+        {
+        case 'b':
+          if (split_type != type_undef)
+            usage ("cannot split in more than one way");
+          split_type = type_bytes;
+          if (convint (optarg, &accum) == -1)
+            usage ("invalid number of bytes");
+          break;
+
+        case 'l':
+          if (split_type != type_undef)
+            usage ("cannot split in more than one way");
+          split_type = type_lines;
+          if (!isdigits (optarg))
+            usage ("invalid number of lines");
+          accum = atoi (optarg);
+          break;
+
+        case 'C':
+          if (split_type != type_undef)
+            usage ("cannot split in more than one way");
+          split_type = type_byteslines;
+          if (convint (optarg, &accum) == -1)
+            usage ("invalid number of bytes");
+          break;
+
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          /* Digits given as options (`-100') accumulate into a line
+             count, one digit per getopt_long call.  */
+          if (split_type != type_undef && split_type != type_digits)
+            usage ("cannot split in more than one way");
+          if (digits_optind != 0 && digits_optind != this_optind)
+            accum = 0;          /* More than one number given; ignore other.  */
+          digits_optind = this_optind;
+          split_type = type_digits;
+          accum = accum * 10 + c - '0';
+          break;
+
+        default:
+          usage ((char *)0);
+        }
+    }
+
+  /* Handle default case.  */
+  if (split_type == type_undef)
+    {
+      split_type = type_lines;
+      accum = 1000;
+    }
+
+  if (accum < 1)
+    usage ("invalid number");
+  num = accum;
+
+  /* Get out the filename arguments.  */
+
+  if (optind < argc)
+    infile = argv[optind++];
+
+  if (optind < argc)
+    outbase = argv[optind++];
+
+  if (optind < argc)
+    usage ("too many arguments");
+
+  /* Open the input file.  */
+  if (!strcmp (infile, "-"))
+    input_desc = 0;
+  else
+    {
+      input_desc = open (infile, O_RDONLY);
+      if (input_desc < 0)
+        error (1, errno, "%s", infile);
+    }
+
+  /* No output file is open now.  */
+  output_desc = -1;
+
+  /* Copy the output file prefix so we can add suffixes to it.
+     26**29 is certainly enough output files!  */
+
+  outfile = xmalloc (strlen (outbase) + 30);
+  strcpy (outfile, outbase);
+  outfile_mid = outfile + strlen (outfile);
+  outfile_end = outfile_mid + 2;
+  bzero (outfile_mid, 30);
+  outfile_mid[0] = 'a';
+  outfile_mid[1] = 'a' - 1;  /* first call to next_file_name makes it an 'a' */
+
+  /* Get the optimal block size of input device and make a buffer.  */
+
+  if (fstat (input_desc, &stat_buf) < 0)
+    error (1, errno, "%s", infile);
+  in_blk_size = ST_BLKSIZE (stat_buf);
+
+  /* One extra byte: lines_split stores a sentinel newline just past
+     the data it has read.  */
+  buf = xmalloc (in_blk_size + 1);
+
+  switch (split_type)
+    {
+    case type_digits:
+    case type_lines:
+      lines_split (num, buf, in_blk_size);
+      break;
+
+    case type_bytes:
+      bytes_split (num, buf, in_blk_size);
+      break;
+
+    case type_byteslines:
+      line_bytes_split (num);
+      break;
+    }
+
+  if (close (input_desc) < 0)
+    error (1, errno, "%s", infile);
+  if (output_desc >= 0 && close (output_desc) < 0)
+    error (1, errno, "%s", outfile);
+
+  exit (0);
+}
+
+/* Return nonzero if the string STR is nonempty and composed entirely
+   of decimal digits; return 0 otherwise (including for "").  */
+
+int
+isdigits (str)
+     char *str;
+{
+  do
+    {
+      /* Convert to unsigned char first: handing isdigit a negative
+         `char' value is undefined behavior.  */
+      if (!isdigit ((unsigned char) *str))
+        return 0;
+      str++;
+    }
+  while (*str);
+  return 1;
+}
+
+/* Put the value of the number in STR into *VAL.
+   STR can specify a positive integer, optionally ending in `b'
+   to mean 512-byte blocks, `k' to mean kilo or `m' to mean mega.
+   A recognized suffix is overwritten with a NUL in STR itself.
+   Return 0 if STR is valid, -1 if not.  */
+
+int
+convint (str, val)
+     char *str;
+     int *val;
+{
+  int multiplier = 1;
+  int arglen = strlen (str);
+
+  /* A suffix is only looked for when at least one character precedes it.  */
+  if (arglen > 1)
+    {
+      switch (str[arglen - 1])
+        {
+        case 'b':
+          multiplier = 512;
+          str[arglen - 1] = '\0';
+          break;
+        case 'k':
+          multiplier = 1024;
+          str[arglen - 1] = '\0';
+          break;
+        case 'm':
+          multiplier = 1048576;
+          str[arglen - 1] = '\0';
+          break;
+        }
+    }
+  if (!isdigits (str))
+    return -1;
+  *val = atoi (str) * multiplier;
+  return 0;
+}
+
+/* Split into pieces of exactly NCHARS bytes.
+   Use buffer BUF, whose size is BUFSIZE.
*/
+
+void
+bytes_split (nchars, buf, bufsize)
+     int nchars;
+     char *buf;
+     int bufsize;
+{
+  int n_read;
+  int new_file_flag = 1;
+  int to_read;
+  int to_write = nchars;        /* Bytes still owed to the current piece.  */
+  char *bp_out;
+
+  do
+    {
+      n_read = stdread (buf, bufsize);
+      if (n_read < 0)
+        error (1, errno, "%s", infile);
+      bp_out = buf;
+      to_read = n_read;
+      for (;;)
+        {
+          if (to_read < to_write)
+            {
+              /* The buffer runs out before the piece is complete:
+                 write what is left and keep the piece open.  */
+              if (to_read)      /* do not write 0 bytes! */
+                {
+                  cwrite (new_file_flag, bp_out, to_read);
+                  to_write -= to_read;
+                  new_file_flag = 0;
+                }
+              break;
+            }
+          else
+            {
+              /* Finish the current piece; the next write starts a
+                 new output file.  */
+              cwrite (new_file_flag, bp_out, to_write);
+              bp_out += to_write;
+              to_read -= to_write;
+              new_file_flag = 1;
+              to_write = nchars;
+            }
+        }
+    }
+  while (n_read == bufsize);
+}
+
+/* Split into pieces of exactly NLINES lines.
+   Use buffer BUF, whose size is BUFSIZE; BUF must have room for one
+   byte beyond BUFSIZE, because a sentinel newline is stored just past
+   the data read.  */
+
+void
+lines_split (nlines, buf, bufsize)
+     int nlines;
+     char *buf;
+     int bufsize;
+{
+  int n_read;
+  char *bp, *bp_out, *eob;
+  int new_file_flag = 1;
+  int n = 0;                    /* Lines seen so far in the current piece.  */
+
+  do
+    {
+      n_read = stdread (buf, bufsize);
+      if (n_read < 0)
+        error (1, errno, "%s", infile);
+      bp = bp_out = buf;
+      eob = bp + n_read;
+      /* Sentinel: guarantees the newline scan below terminates.  */
+      *eob = '\n';
+      for (;;)
+        {
+          while (*bp++ != '\n')
+            ;                   /* this semicolon takes most of the time */
+          if (bp > eob)
+            {
+              /* Stopped on the sentinel: flush the unwritten tail of
+                 the buffer into the current piece.  */
+              if (eob != bp_out) /* do not write 0 bytes! */
+                {
+                  cwrite (new_file_flag, bp_out, eob - bp_out);
+                  new_file_flag = 0;
+                }
+              break;
+            }
+          else
+            if (++n >= nlines)
+              {
+                cwrite (new_file_flag, bp_out, bp - bp_out);
+                bp_out = bp;
+                new_file_flag = 1;
+                n = 0;
+              }
+        }
+    }
+  while (n_read == bufsize);
+}
+
+/* Split into pieces that are as large as possible while still not more
+   than NCHARS bytes, and are split on line boundaries except
+   where lines longer than NCHARS bytes occur.  */
+
+void
+line_bytes_split (nchars)
+     int nchars;
+{
+  int n_read;
+  char *bp;
+  int eof = 0;
+  int n_buffered = 0;
+  char *buf = (char *) xmalloc (nchars);
+
+  do
+    {
+      /* Fill up the full buffer size from the input file.  */
+
+      n_read = stdread (buf + n_buffered, nchars - n_buffered);
+      if (n_read < 0)
+        error (1, errno, "%s", infile);
+
+      n_buffered += n_read;
+      if (n_buffered != nchars)
+        eof = 1;
+
+      /* Find where to end this chunk.  */
+      bp = buf + n_buffered;
+      if (n_buffered == nchars)
+        {
+          while (bp > buf && bp[-1] != '\n')
+            bp--;
+        }
+
+      /* If chunk has no newlines, use all the chunk.  */
+      if (bp == buf)
+        bp = buf + n_buffered;
+
+      /* Output the chars as one output file.  When nothing at all is
+         buffered (empty input, or input that ended exactly at the end
+         of the previous chunk) skip the write, so that no zero-length
+         output file is created -- the same rule bytes_split and
+         lines_split follow.  */
+      if (bp != buf)
+        cwrite (1, buf, bp - buf);
+
+      /* Discard the chars we just output; move rest of chunk
+         down to be the start of the next chunk.  */
+      n_buffered -= bp - buf;
+      if (n_buffered > 0)
+        bcopy (bp, buf, n_buffered);
+    }
+  while (!eof);
+  free (buf);
+}
+
+/* Write BYTES bytes at BP to an output file.
+   If NEW_FILE_FLAG is nonzero, open the next output file.
+   Otherwise add to the same output file already in use.
+   Any failure to open, close or write is fatal.  */
+
+void
+cwrite (new_file_flag, bp, bytes)
+     int new_file_flag;
+     char *bp;
+     int bytes;
+{
+  if (new_file_flag)
+    {
+      if (output_desc >= 0 && close (output_desc) < 0)
+        error (1, errno, "%s", outfile);
+
+      next_file_name ();
+      output_desc = open (outfile, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+      if (output_desc < 0)
+        error (1, errno, "%s", outfile);
+    }
+
+  /* write may accept fewer bytes than requested; keep going until
+     everything is out instead of silently dropping the remainder.  */
+  while (bytes > 0)
+    {
+      int n_written = write (output_desc, bp, bytes);
+
+      if (n_written <= 0)
+        error (1, errno, "%s", outfile);
+      bp += n_written;
+      bytes -= n_written;
+    }
+}
+
+/* Read NCHARS bytes from the input file into BUF.
+   Return the number of bytes successfully read, or -1 if a read fails.
+   If this is less than NCHARS, do not call `stdread' again.  */
+
+int
+stdread (buf, nchars)
+     char *buf;
+     int nchars;
+{
+  int n_read;
+  int to_be_read = nchars;
+
+  /* Loop because a single read may return less than was asked for;
+     only a zero return (end of input) stops the loop early.  */
+  while (to_be_read)
+    {
+      n_read = read (input_desc, buf, to_be_read);
+      if (n_read < 0)
+        return -1;
+      if (n_read == 0)
+        break;
+      to_be_read -= n_read;
+      buf += n_read;
+    }
+  return nchars - to_be_read;
+}
+
+/* Compute the next sequential output file name suffix and store it
+   into the string `outfile' at the position pointed to by `outfile_mid'.
*/
+
+void
+next_file_name ()
+{
+  int x;
+  char *ne;
+
+  outfile_count++;
+  if (outfile_count < outfile_name_limit)
+    {
+      /* Advance the suffix like an odometer: starting at its last
+         character and moving left, every 'z' wraps to 'a'; the first
+         character that is not 'z' is bumped to the next letter.  */
+      for (ne = outfile_end - 1; ; ne--)
+        {
+          x = *ne;
+          if (x != 'z')
+            break;
+          *ne = 'a';
+        }
+      *ne = x + 1;
+      return;
+    }
+
+  /* The names of this generation are used up.  Start the next one:
+     store a 'z' at `outfile_mid' and step past it, then fill the new
+     suffix positions with 'a'.  The name grows by two characters and
+     the name limit grows by a factor of 26.  */
+  outfile_count = 0;
+  outfile_name_limit *= 26;
+  outfile_name_generation++;
+  *outfile_mid++ = 'z';
+  for (x = 0; x <= outfile_name_generation; x++)
+    outfile_mid[x] = 'a';
+  outfile_end += 2;
+}
diff --git a/src/sum.c b/src/sum.c
new file mode 100644
index 0000000..9236614
--- /dev/null
+++ b/src/sum.c
@@ -0,0 +1,217 @@
+/* sum -- checksum and count the blocks in a file
+   Copyright (C) 1986, 1989, 1991 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+/* Like BSD sum or SysV sum -r, except like SysV sum if -s option is given. */
+
+/* Written by Kayvan Aghaiepour and David MacKenzie. */
+
+#include
+#include
+#include
+#include "system.h"
+
+int bsd_sum_file ();
+int sysv_sum_file ();
+void error ();
+
+/* The name this program was run with. */
+char *program_name;
+
+/* Nonzero if any of the files read were the standard input. */
+int have_read_stdin;
+
+/* Right-rotate the low 16 bits of integer variable C by one position.
*/
+/* Bit 0 of C moves to bit 15 (0x8000) and the other bits shift right
+   by one.  Wrapped in do ... while (0) so that the macro expands to
+   exactly one statement and is safe inside an unbraced if/else.  */
+#define ROTATE_RIGHT(c) \
+  do { if ((c) & 01) (c) = ((c) >> 1) + 0x8000; else (c) >>= 1; } while (0)
+
+struct option longopts[] =
+{
+  {"sysv", 0, NULL, 's'},
+  {NULL, 0, NULL, 0}
+};
+
+/* Choose the checksum routine from the options (-r selects the BSD
+   routine, which is also the default; -s or --sysv selects the SysV
+   routine), run it on each file operand, or on standard input if there
+   are none, and exit with status 1 if any file failed.  */
+
+void
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  int errors = 0;
+  int optc;
+  int files_given;
+  int (*sum_func) () = bsd_sum_file;
+
+  program_name = argv[0];
+  have_read_stdin = 0;
+
+  while ((optc = getopt_long (argc, argv, "rs", longopts, (int *) 0)) != -1)
+    {
+      switch (optc)
+        {
+        case 'r':               /* For SysV compatibility. */
+          sum_func = bsd_sum_file;
+          break;
+
+        case 's':
+          sum_func = sysv_sum_file;
+          break;
+
+        case '?':
+          fprintf (stderr, "\
+Usage: %s [-rs] [--sysv] [file...]\n", argv[0]);
+          exit (1);
+        }
+    }
+
+  /* The operand count is passed to the sum routines, which use it to
+     decide whether to print the file name.  */
+  files_given = argc - optind;
+  if (files_given == 0)
+    {
+      if ((*sum_func) ("-", files_given) < 0)
+        errors = 1;
+    }
+  else
+    for (; optind < argc; optind++)
+      if ((*sum_func) (argv[optind], files_given) < 0)
+        errors = 1;
+
+  if (have_read_stdin && fclose (stdin) == EOF)
+    error (1, errno, "-");
+  exit (errors);
+}
+
+/* Calculate and print the rotated checksum and the size in 1K blocks
+   of file FILE, or of the standard input if FILE is "-".
+   If PRINT_NAME is >1, print FILE next to the checksum and size.
+   The checksum varies depending on sizeof(int).
+   Return 0 if successful, -1 if an error occurs. */
+
+int
+bsd_sum_file (file, print_name)
+     char *file;
+     int print_name;
+{
+  register FILE *fp;
+  register unsigned long checksum = 0; /* The checksum mod 2^16. */
+  register long total_bytes = 0; /* The number of bytes. */
+  register int ch;              /* Each character read. */
+
+  if (!strcmp (file, "-"))
+    {
+      fp = stdin;
+      have_read_stdin = 1;
+    }
+  else
+    {
+      fp = fopen (file, "r");
+      if (fp == NULL)
+        {
+          error (0, errno, "%s", file);
+          return -1;
+        }
+    }
+
+  /* This algorithm seems to depend on sign extension in `ch' in order to
+     give the right results.  Ick. */
+  while ((ch = getc (fp)) != EOF)
+    {
+      total_bytes++;
+      ROTATE_RIGHT (checksum);
+      checksum += ch;
+      checksum &= 0xffff;       /* Keep it within bounds. */
+    }
+
+  if (ferror (fp))
+    {
+      error (0, errno, "%s", file);
+      /* Standard input is left open; `main' closes it.  */
+      if (strcmp (file, "-"))
+        fclose (fp);
+      return -1;
+    }
+
+  if (strcmp (file, "-") && fclose (fp) == EOF)
+    {
+      error (0, errno, "%s", file);
+      return -1;
+    }
+
+  /* The size is rounded up to whole 1024-byte blocks.  */
+  printf ("%05lu %5ld", checksum, (total_bytes + 1024 - 1) / 1024);
+  if (print_name > 1)
+    printf (" %s", file);
+  putchar ('\n');
+
+  return 0;
+}
+
+/* Calculate and print the checksum and the size in 512-byte blocks
+   of file FILE, or of the standard input if FILE is "-".
+   If PRINT_NAME is >0, print FILE next to the checksum and size.
+   Return 0 if successful, -1 if an error occurs. */
+
+int
+sysv_sum_file (file, print_name)
+     char *file;
+     int print_name;
+{
+  int fd;
+  unsigned char buf[8192];
+  register int bytes_read;
+  register unsigned long checksum = 0;
+  long total_bytes = 0;
+
+  if (!strcmp (file, "-"))
+    {
+      fd = 0;
+      have_read_stdin = 1;
+    }
+  else
+    {
+      fd = open (file, O_RDONLY);
+      if (fd == -1)
+        {
+          error (0, errno, "%s", file);
+          return -1;
+        }
+    }
+
+  /* The checksum is the plain sum of all the bytes.  */
+  while ((bytes_read = read (fd, buf, sizeof buf)) > 0)
+    {
+      register int i;
+
+      for (i = 0; i < bytes_read; i++)
+        checksum += buf[i];
+      total_bytes += bytes_read;
+    }
+
+  if (bytes_read < 0)
+    {
+      error (0, errno, "%s", file);
+      if (strcmp (file, "-"))
+        close (fd);
+      return -1;
+    }
+
+  if (strcmp (file, "-") && close (fd) == -1)
+    {
+      error (0, errno, "%s", file);
+      return -1;
+    }
+
+  /* NOTE(review): the sum is reduced modulo 0xffff (65535), not
+     0x10000; confirm this matches the System V `sum' being emulated.  */
+  printf ("%lu %ld", checksum % 0xffff, (total_bytes + 512 - 1) / 512);
+  if (print_name)
+    printf (" %s", file);
+  putchar ('\n');
+
+  return 0;
+}
diff --git a/src/tac.c b/src/tac.c
new file mode 100644
index 0000000..78e1846
--- /dev/null
+++ b/src/tac.c
@@ -0,0 +1,628 @@
+/* tac - concatenate and print files in reverse
+   Copyright (C) 1988, 1989, 1990, 1991 Free Software Foundation, Inc.
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Jay Lepreau (lepreau@cs.utah.edu). + GNU enhancements by David MacKenzie (djm@ai.mit.edu). */ + +/* Copy each FILE, or the standard input if none are given or when a + FILE name of "-" is encountered, to the standard output with the + order of the records reversed. The records are separated by + instances of a string, or a newline if none is given. By default, the + separator string is attached to the end of the record that it + follows in the file. + + Options: + -b, --before The separator is attached to the beginning + of the record that it precedes in the file. + -r, --regex The separator is a regular expression. + -s, --separator=separator Use SEPARATOR as the record separator. + + To reverse a file byte by byte, use (in bash, ksh, or sh): +tac -r -s '.\| +' file */ + +#include +#include +#include +#include +#include +#include "system.h" + +#ifndef STDC_HEADERS +char *malloc (); +char *realloc (); +#endif + +/* The number of bytes per atomic read. */ +#define INITIAL_READSIZE 8192 + +/* The number of bytes per atomic write. 
*/
+#define WRITESIZE 8192
+
+char *mktemp ();
+
+RETSIGTYPE cleanup ();
+int tac ();
+int tac_file ();
+int tac_stdin ();
+char *xmalloc ();
+char *xrealloc ();
+void output ();
+void error ();
+void save_stdin ();
+void xwrite ();
+
+/* The name this program was run with. */
+char *program_name;
+
+/* The string that separates the records of the file. */
+char *separator;
+
+/* If nonzero, print `separator' along with the record preceding it
+   in the file; otherwise with the record following it. */
+int separator_ends_record;
+
+/* 0 if `separator' is to be matched as a regular expression;
+   otherwise, the length of `separator', used as a sentinel to
+   stop the search. */
+int sentinel_length;
+
+/* The length of a match with `separator'.  If `sentinel_length' is 0,
+   `match_length' is computed every time a match succeeds;
+   otherwise, it is simply the length of `separator'. */
+int match_length;
+
+/* The input buffer. */
+char *buffer;
+
+/* The number of bytes to read at once into `buffer'. */
+unsigned read_size;
+
+/* The size of `buffer'.  This is read_size * 2 + sentinel_length + 2.
+   The extra 2 bytes allow `past_end' to have a value beyond the
+   end of `buffer' and `match_start' to run off the front of `buffer'. */
+unsigned buffer_size;
+
+/* The compiled regular expression representing `separator'. */
+static struct re_pattern_buffer compiled_separator;
+
+/* --before and --regex store 0 directly into the corresponding flag
+   variable; --separator is handled like -s.  */
+struct option longopts[] =
+{
+  {"before", 0, &separator_ends_record, 0},
+  {"regex", 0, &sentinel_length, 0},
+  {"separator", 1, NULL, 's'},
+  {NULL, 0, NULL, 0}
+};
+
+/* Parse the options, prepare the separator and the input buffer, then
+   reverse each file operand (or standard input if there are none, or
+   where an operand is "-").  Exits with the OR of the per-file error
+   flags.  */
+
+void
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  char *error_message;          /* Return value from re_compile_pattern. */
+  int optc, errors;
+  int have_read_stdin = 0;
+
+  program_name = argv[0];
+  errors = 0;
+  separator = "\n";
+  sentinel_length = 1;
+  separator_ends_record = 1;
+
+  while ((optc = getopt_long (argc, argv, "brs:", longopts, (int *) 0))
+         != EOF)
+    {
+      switch (optc)
+        {
+        case 0:
+          /* A long option that already stored into its flag variable.  */
+          break;
+        case 'b':
+          separator_ends_record = 0;
+          break;
+        case 'r':
+          sentinel_length = 0;
+          break;
+        case 's':
+          separator = optarg;
+          if (*separator == 0)
+            error (1, 0, "separator cannot be empty");
+          break;
+        default:
+          fprintf (stderr, "\
+Usage: %s [-br] [-s separator] [--before] [--regex] [--separator=separator]\n\
+ [file...]\n",
+                   program_name);
+          exit (1);
+        }
+    }
+
+  if (sentinel_length == 0)
+    {
+      /* Regular expression separator: compile it once up front.  */
+      compiled_separator.allocated = 100;
+      compiled_separator.buffer = (unsigned char *)
+        xmalloc (compiled_separator.allocated);
+      compiled_separator.fastmap = xmalloc (256);
+      compiled_separator.translate = 0;
+      error_message = re_compile_pattern (separator, strlen (separator),
+                                          &compiled_separator);
+      if (error_message)
+        error (1, 0, "%s", error_message);
+    }
+  else
+    match_length = sentinel_length = strlen (separator);
+
+  read_size = INITIAL_READSIZE;
+  /* A precaution that will probably never be needed. */
+  while (sentinel_length * 2 >= read_size)
+    read_size *= 2;
+  buffer_size = read_size * 2 + sentinel_length + 2;
+  buffer = xmalloc (buffer_size);
+  /* Leave room in front of `buffer': a copy of the separator for the
+     literal search, or a single byte for the regexp search.  */
+  if (sentinel_length)
+    {
+      strcpy (buffer, separator);
+      buffer += sentinel_length;
+    }
+  else
+    ++buffer;
+
+  if (optind == argc)
+    {
+      have_read_stdin = 1;
+      errors = tac_stdin ();
+    }
+  else
+    for (; optind < argc; ++optind)
+      {
+        if (strcmp (argv[optind], "-") == 0)
+          {
+            have_read_stdin = 1;
+            errors |= tac_stdin ();
+          }
+        else
+          errors |= tac_file (argv[optind]);
+      }
+
+  /* Flush the output buffer. */
+  output ((char *) NULL, (char *) NULL);
+
+  if (have_read_stdin && close (0) < 0)
+    error (1, errno, "-");
+  if (close (1) < 0)
+    error (1, errno, "write error");
+  exit (errors);
+}
+
+/* The name of a temporary file containing a copy of pipe input. */
+char *tempfile;
+
+/* Print the standard input in reverse, saving it to temporary
+   file `tempfile' first if it is a pipe.
+   Return 0 if ok, 1 if an error occurs. */
+
+int
+tac_stdin ()
+{
+  /* Previous values of signal handlers. */
+  RETSIGTYPE (*sigint) (), (*sighup) (), (*sigpipe) (), (*sigterm) ();
+  int errors;
+  struct stat stats;
+#ifdef _POSIX_VERSION
+  struct sigaction oldact, newact;
+#endif                          /* _POSIX_VERSION */
+
+  /* No tempfile is needed for "tac < file".
+     Use fstat instead of checking for errno == ESPIPE because
+     lseek doesn't work on some special files but doesn't return an
+     error, either. */
+  if (fstat (0, &stats))
+    {
+      error (0, errno, "standard input");
+      return 1;
+    }
+  if (S_ISREG (stats.st_mode))
+    return tac (0, "standard input");
+
+  /* Install `cleanup' for each of the four signals, unless the signal
+     is currently ignored, so that the temporary file is removed if the
+     program is interrupted.  The previous handlers are remembered and
+     put back below.  */
+#ifdef _POSIX_VERSION
+  newact.sa_handler = cleanup;
+  sigemptyset (&newact.sa_mask);
+  newact.sa_flags = 0;
+
+  sigaction (SIGINT, NULL, &oldact);
+  sigint = oldact.sa_handler;
+  if (sigint != SIG_IGN)
+    sigaction (SIGINT, &newact, NULL);
+
+  sigaction (SIGHUP, NULL, &oldact);
+  sighup = oldact.sa_handler;
+  if (sighup != SIG_IGN)
+    sigaction (SIGHUP, &newact, NULL);
+
+  sigaction (SIGPIPE, NULL, &oldact);
+  sigpipe = oldact.sa_handler;
+  if (sigpipe != SIG_IGN)
+    sigaction (SIGPIPE, &newact, NULL);
+
+  sigaction (SIGTERM, NULL, &oldact);
+  sigterm = oldact.sa_handler;
+  if (sigterm != SIG_IGN)
+    sigaction (SIGTERM, &newact, NULL);
+#else                           /* !_POSIX_VERSION */
+  sigint = signal (SIGINT, SIG_IGN);
+  if (sigint != SIG_IGN)
+    signal (SIGINT, cleanup);
+
+  sighup = signal (SIGHUP, SIG_IGN);
+  if (sighup != SIG_IGN)
+    signal (SIGHUP, cleanup);
+
+  sigpipe = signal (SIGPIPE, SIG_IGN);
+  if (sigpipe != SIG_IGN)
+    signal (SIGPIPE, cleanup);
+
+  sigterm = signal (SIGTERM, SIG_IGN);
+  if (sigterm != SIG_IGN)
+    signal (SIGTERM, cleanup);
+#endif                          /* _POSIX_VERSION */
+
+  save_stdin ();
+
+  errors = tac_file (tempfile);
+
+  unlink (tempfile);
+
+  /* Put the saved handlers back.  */
+#ifdef _POSIX_VERSION
+  newact.sa_handler = sigint;
+  sigaction (SIGINT, &newact, NULL);
+  newact.sa_handler = sighup;
+  sigaction (SIGHUP, &newact, NULL);
+  newact.sa_handler = sigterm;
+  sigaction (SIGTERM, &newact, NULL);
+  newact.sa_handler = sigpipe;
+  sigaction (SIGPIPE, &newact, NULL);
+#else                           /* !_POSIX_VERSION */
+  signal (SIGINT, sigint);
+  signal (SIGHUP, sighup);
+  signal (SIGTERM, sigterm);
+  signal (SIGPIPE, sigpipe);
+#endif                          /* _POSIX_VERSION */
+
+  return errors;
+}
+
+/* Make a copy of the standard input in `tempfile'.
+   On any failure, report it and leave through `cleanup', which removes
+   the temporary file and exits.  */
+
+void
+save_stdin ()
+{
+  static char *template = NULL;
+  static char *tempdir;
+  int fd;
+  int bytes_read;
+
+  if (template == NULL)
+    {
+      tempdir = getenv ("TMPDIR");
+      if (tempdir == NULL)
+        tempdir = "/tmp";
+      /* 11 = strlen ("/tacXXXXXX") + 1 for the terminating NUL.  */
+      template = xmalloc (strlen (tempdir) + 11);
+    }
+  sprintf (template, "%s/tacXXXXXX", tempdir);
+  /* NOTE(review): the name is generated by mktemp and the file is
+     created afterwards by a separate creat call; consider whether an
+     atomic create is wanted here.  */
+  tempfile = mktemp (template);
+
+  fd = creat (tempfile, 0600);
+  if (fd == -1)
+    {
+      error (0, errno, "%s", tempfile);
+      cleanup ();
+    }
+  while ((bytes_read = read (0, buffer, read_size)) > 0)
+    if (write (fd, buffer, bytes_read) != bytes_read)
+      {
+        error (0, errno, "%s", tempfile);
+        cleanup ();
+      }
+  if (close (fd) < 0)
+    {
+      error (0, errno, "%s", tempfile);
+      cleanup ();
+    }
+  if (bytes_read == -1)
+    {
+      error (0, errno, "read error");
+      cleanup ();
+    }
+}
+
+/* Print FILE in reverse.
+   Return 0 if ok, 1 if an error occurs. */
+
+int
+tac_file (file)
+     char *file;
+{
+  int fd, errors;
+
+  fd = open (file, 0);
+  if (fd == -1)
+    {
+      error (0, errno, "%s", file);
+      return 1;
+    }
+  errors = tac (fd, file);
+  if (close (fd) < 0)
+    {
+      error (0, errno, "%s", file);
+      return 1;
+    }
+  return errors;
+}
+
+/* Print in reverse the file open on descriptor FD for reading FILE.
+   Return 0 if ok, 1 if an error occurs.
*/
+
+int
+tac (fd, file)
+     int fd;
+     char *file;
+{
+  /* Pointer to the location in `buffer' where the search for
+     the next separator will begin. */
+  char *match_start;
+  /* Pointer to one past the rightmost character in `buffer' that
+     has not been printed yet. */
+  char *past_end;
+  unsigned saved_record_size;   /* Length of the record growing in `buffer'. */
+  off_t file_pos;               /* Offset in the file of the next read. */
+  /* Nonzero if `output' has not been called yet for any file.
+     Only used when the separator is attached to the preceding record. */
+  int first_time = 1;
+  char first_char = *separator; /* Speed optimization, non-regexp. */
+  char *separator1 = separator + 1; /* Speed optimization, non-regexp. */
+  int match_length1 = match_length - 1; /* Speed optimization, non-regexp. */
+  struct re_registers regs;
+
+  /* Find the size of the input file. */
+  file_pos = lseek (fd, (off_t) 0, SEEK_END);
+  if (file_pos < 1)
+    return 0;                   /* It's an empty file. */
+
+  /* Arrange for the first read to lop off enough to leave the rest of the
+     file a multiple of `read_size'.  Since `read_size' can change, this may
+     not always hold during the program run, but since it usually will, leave
+     it here for i/o efficiency (page/sector boundaries and all that).
+     Note: the efficiency gain has not been verified. */
+  saved_record_size = file_pos % read_size;
+  if (saved_record_size == 0)
+    saved_record_size = read_size;
+  file_pos -= saved_record_size;
+  /* `file_pos' now points to the start of the last (probably partial) block
+     in the input file. */
+
+  lseek (fd, file_pos, SEEK_SET);
+  if (read (fd, buffer, saved_record_size) != saved_record_size)
+    {
+      /* Report the system error code, as the other read in this
+         function does, rather than a hard-coded error number.  */
+      error (0, errno, "%s", file);
+      return 1;
+    }
+
+  match_start = past_end = buffer + saved_record_size;
+  /* For non-regexp search, move past impossible positions for a match. */
+  if (sentinel_length)
+    match_start -= match_length1;
+
+  for (;;)
+    {
+      /* Search backward from `match_start' - 1 to `buffer' for a match
+         with `separator'; for speed, use strncmp if `separator' contains no
+         metacharacters.
+         If the match succeeds, set `match_start' to point to the start of
+         the match and `match_length' to the length of the match.
+         Otherwise, make `match_start' < `buffer'. */
+      if (sentinel_length == 0)
+        {
+          int i = match_start - buffer;
+          int ret;
+
+          ret = re_search (&compiled_separator, buffer, i, i - 1, -i, &regs);
+          if (ret == -1)
+            match_start = buffer - 1;
+          else if (ret == -2)
+            {
+              error (0, 0, "error in regular expression search");
+              cleanup ();
+            }
+          else
+            {
+              match_start = buffer + regs.start[0];
+              match_length = regs.end[0] - regs.start[0];
+            }
+        }
+      else
+        {
+          /* `match_length' is constant for non-regexp boundaries. */
+          /* The copy of `separator' that `main' stored just in front
+             of `buffer' acts as a sentinel that stops this scan.  */
+          while (*--match_start != first_char
+                 || (match_length1 && strncmp (match_start + 1, separator1,
+                                               match_length1)))
+            /* Do nothing. */ ;
+        }
+
+      /* Check whether we backed off the front of `buffer' without finding
+         a match for `separator'. */
+      if (match_start < buffer)
+        {
+          if (file_pos == 0)
+            {
+              /* Hit the beginning of the file; print the remaining record. */
+              output (buffer, past_end);
+              return 0;
+            }
+
+          saved_record_size = past_end - buffer;
+          if (saved_record_size > read_size)
+            {
+              /* `buffer_size' is about twice `read_size', so since
+                 we want to read in another `read_size' bytes before
+                 the data already in `buffer', we need to increase
+                 `buffer_size'. */
+              char *newbuffer;
+              int offset = sentinel_length ? sentinel_length : 1;
+
+              read_size *= 2;
+              buffer_size = read_size * 2 + sentinel_length + 2;
+              newbuffer = xrealloc (buffer - offset, buffer_size) + offset;
+              /* Adjust the pointers for the new buffer location.  */
+              match_start += newbuffer - buffer;
+              past_end += newbuffer - buffer;
+              buffer = newbuffer;
+            }
+
+          /* Back up to the start of the next bufferfull of the file.  */
+          if (file_pos >= read_size)
+            file_pos -= read_size;
+          else
+            {
+              read_size = file_pos;
+              file_pos = 0;
+            }
+          lseek (fd, file_pos, SEEK_SET);
+
+          /* Shift the pending record data right to make room for the new. */
+          bcopy (buffer, buffer + read_size, saved_record_size);
+          past_end = buffer + read_size + saved_record_size;
+          /* For non-regexp searches, avoid unnecessary scanning. */
+          if (sentinel_length)
+            match_start = buffer + read_size;
+          else
+            match_start = past_end;
+
+          if (read (fd, buffer, read_size) != read_size)
+            {
+              error (0, errno, "%s", file);
+              return 1;
+            }
+        }
+      else
+        {
+          /* Found a match of `separator'. */
+          if (separator_ends_record)
+            {
+              char *match_end = match_start + match_length;
+
+              /* If this match of `separator' isn't at the end of the
+                 file, print the record. */
+              if (first_time == 0 || match_end != past_end)
+                output (match_end, past_end);
+              past_end = match_end;
+              first_time = 0;
+            }
+          else
+            {
+              output (match_start, past_end);
+              past_end = match_start;
+            }
+          match_start -= match_length - 1;
+        }
+    }
+}
+
+/* Print the characters from START to PAST_END - 1.
+   If START is NULL, just flush the buffer. */
+
+void
+output (start, past_end)
+     char *start;
+     char *past_end;
+{
+  static char buffer[WRITESIZE];
+  static int bytes_in_buffer = 0;
+  int bytes_to_add = past_end - start;
+  int bytes_available = WRITESIZE - bytes_in_buffer;
+
+  if (start == 0)
+    {
+      xwrite (1, buffer, bytes_in_buffer);
+      bytes_in_buffer = 0;
+      return;
+    }
+
+  /* Write out as many full buffers as possible. */
+  while (bytes_to_add >= bytes_available)
+    {
+      bcopy (start, buffer + bytes_in_buffer, bytes_available);
+      bytes_to_add -= bytes_available;
+      start += bytes_available;
+      xwrite (1, buffer, WRITESIZE);
+      bytes_in_buffer = 0;
+      bytes_available = WRITESIZE;
+    }
+
+  /* Keep the remainder, which is less than a full buffer, for later.  */
+  bcopy (start, buffer + bytes_in_buffer, bytes_to_add);
+  bytes_in_buffer += bytes_to_add;
+}
+
+/* Remove the temporary file and exit with status 1.  Installed as a
+   signal handler and also called directly on fatal errors.  */
+
+RETSIGTYPE
+cleanup ()
+{
+  unlink (tempfile);
+  exit (1);
+}
+
+/* Write SIZE bytes from BUFFER to descriptor DESC.  Anything other
+   than a complete write is reported as a write error, after which the
+   program exits through `cleanup'.  */
+
+void
+xwrite (desc, buffer, size)
+     int desc;
+     char *buffer;
+     int size;
+{
+  if (write (desc, buffer, size) != size)
+    {
+      error (0, errno, "write error");
+      cleanup ();
+    }
+}
+
+/* Allocate N bytes of memory dynamically, with error checking.  */
+
+char *
+xmalloc (n)
+     unsigned n;
+{
+  char *p;
+
+  p = malloc (n);
+  if (p == 0)
+    {
+      error (0, 0, "virtual memory exhausted");
+      cleanup ();
+    }
+  return p;
+}
+
+/* Change the size of memory area P to N bytes, with error checking. */
+
+char *
+xrealloc (p, n)
+     char *p;
+     unsigned n;
+{
+  p = realloc (p, n);
+  if (p == 0)
+    {
+      error (0, 0, "virtual memory exhausted");
+      cleanup ();
+    }
+  return p;
+}
diff --git a/src/tail.c b/src/tail.c
new file mode 100644
index 0000000..050c193
--- /dev/null
+++ b/src/tail.c
@@ -0,0 +1,858 @@
+/* tail -- output last part of file(s)
+   Copyright (C) 1989, 1990, 1991 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ + +/* Can display any amount of data, unlike the Unix version, which uses + a fixed size buffer and therefore can only deliver a limited number + of lines. + + Options: + -b Tail by N 512-byte blocks. + -c, --bytes=N[bkm] Tail by N bytes + [or 512-byte blocks, kilobytes, or megabytes]. + -f, --follow Loop forever trying to read more characters at the + end of the file, on the assumption that the file + is growing. Ignored if reading from a pipe. + Cannot be used if more than one file is given. + -k Tail by N kilobytes. + -N, -l, -n, --lines=N Tail by N lines. + -m Tail by N megabytes. + -q, --quiet, --silent Never print filename headers. + -v, --verbose Always print filename headers. + + If a number (N) starts with a `+', begin printing with the Nth item + from the start of each file, instead of from the end. + + Reads from standard input if no files are given or when a filename of + ``-'' is encountered. + By default, filename headers are printed only more than one file + is given. + By default, prints the last 10 lines (tail -n 10). + + Original version by Paul Rubin . + Extensions by David MacKenzie . */ + +#include +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISDIGIT(c) (isascii ((c)) && isdigit ((c))) +#else +#define ISDIGIT(c) (isdigit ((c))) +#endif + +/* Number of items to tail. */ +#define DEFAULT_NUMBER 10 + +/* Size of atomic reads. */ +#define BUFSIZE (512 * 8) + +/* Number of bytes per item we are printing. + If 0, tail in lines. */ +int unit_size; + +/* If nonzero, read from end of file until killed. */ +int forever; + +/* If nonzero, count from start of file instead of end. */ +int from_start; + +/* If nonzero, print filename headers. */ +int print_headers; + +/* When to print the filename banners. 
*/
+enum header_mode
+{
+  multiple_files, always, never
+};
+
+char *xmalloc ();
+int file_lines ();
+int pipe_bytes ();
+int pipe_lines ();
+int start_bytes ();
+int start_lines ();
+int tail ();
+int tail_bytes ();
+int tail_file ();
+int tail_lines ();
+long atou();
+void dump_remainder ();
+void error ();
+void parse_unit ();
+void usage ();
+void write_header ();
+void xwrite ();
+
+/* The name this program was run with. */
+char *program_name;
+
+/* Nonzero if we have ever read standard input. */
+int have_read_stdin;
+
+struct option long_options[] =
+{
+  {"bytes", 1, NULL, 'c'},
+  {"follow", 0, NULL, 'f'},
+  {"lines", 1, NULL, 'n'},
+  {"quiet", 0, NULL, 'q'},
+  {"silent", 0, NULL, 'q'},
+  {"verbose", 0, NULL, 'v'},
+  {NULL, 0, NULL, 0}
+};
+
+/* Parse the command line (an old-style `-N...' or `+N...' first argument,
+   then the getopt_long options), run tail_file on every remaining
+   argument, or on "-" when there is none, and exit with the OR of the
+   per-file statuses. */
+
+void
+main (argc, argv)
+     int argc;
+     char **argv;
+{
+  enum header_mode header_mode = multiple_files;
+  int exit_status = 0;
+  /* If from_start, the number of items to skip before printing; otherwise,
+     the number of items at the end of the file to print. Initially, -1
+     means the value has not been set. */
+  long number = -1;
+  int c;                        /* Option character. */
+
+  program_name = argv[0];
+  have_read_stdin = 0;
+  unit_size = 0;
+  forever = from_start = print_headers = 0;
+
+  if (argc > 1
+      && ((argv[1][0] == '-' && ISDIGIT (argv[1][1]))
+          || (argv[1][0] == '+' && (ISDIGIT (argv[1][1]) || argv[1][1] == 0))))
+    {
+      /* Old option syntax: a dash or plus, one or more digits (zero digits
+         are acceptable with a plus), and one or more option letters. */
+      if (argv[1][0] == '+')
+        from_start = 1;
+      /* A bare `+' leaves `number' at -1, so the default is used below. */
+      if (argv[1][1] != 0)
+        {
+          for (number = 0, ++argv[1]; ISDIGIT (*argv[1]); ++argv[1])
+            number = number * 10 + *argv[1] - '0';
+          /* Parse any appended option letters. */
+          while (*argv[1])
+            {
+              switch (*argv[1])
+                {
+                case 'b':
+                  unit_size = 512;
+                  break;
+
+                case 'c':
+                  unit_size = 1;
+                  break;
+
+                case 'f':
+                  forever = 1;
+                  break;
+
+                case 'k':
+                  unit_size = 1024;
+                  break;
+
+                case 'l':
+                  unit_size = 0;
+                  break;
+
+                case 'm':
+                  unit_size = 1048576;
+                  break;
+
+                case 'q':
+                  header_mode = never;
+                  break;
+
+                case 'v':
+                  header_mode = always;
+                  break;
+
+                default:
+                  error (0, 0, "unrecognized option `-%c'", *argv[1]);
+                  usage ();
+                }
+              ++argv[1];
+            }
+        }
+      /* Make the options we just parsed invisible to getopt. */
+      argv[1] = argv[0];
+      argv++;
+      argc--;
+    }
+
+  while ((c = getopt_long (argc, argv, "c:n:fqv", long_options, (int *) 0))
+         != EOF)
+    {
+      switch (c)
+        {
+        case 'c':
+          unit_size = 1;
+          /* parse_unit may replace unit_size and strip a trailing
+             multiplier letter from optarg. */
+          parse_unit (optarg);
+          /* Jump past the `unit_size = 0' of -n so the byte unit chosen
+             above is kept; the number parsing is shared. */
+          goto getnum;
+        case 'n':
+          unit_size = 0;
+        getnum:
+          if (*optarg == '+')
+            {
+              from_start = 1;
+              ++optarg;
+            }
+          else if (*optarg == '-')
+            ++optarg;
+          number = atou (optarg);
+          if (number == -1)
+            error (1, 0, "invalid number `%s'", optarg);
+          break;
+
+        case 'f':
+          forever = 1;
+          break;
+
+        case 'q':
+          header_mode = never;
+          break;
+
+        case 'v':
+          header_mode = always;
+          break;
+
+        default:
+          usage ();
+        }
+    }
+
+  if (number == -1)
+    number = DEFAULT_NUMBER;
+
+  /* To start printing with item `number' from the start of the file, skip
+     `number' - 1 items. `tail +0' is actually meaningless, but for Unix
+     compatibility it's treated the same as `tail +1'. */
+  if (from_start)
+    {
+      if (number)
+        --number;
+    }
+
+  /* From here on `number' counts bytes when tailing by blocks, kilobytes
+     or megabytes. */
+  if (unit_size > 1)
+    number *= unit_size;
+
+  if (optind < argc - 1 && forever)
+    error (1, 0, "cannot follow the ends of multiple files");
+
+  if (header_mode == always
+      || (header_mode == multiple_files && optind < argc - 1))
+    print_headers = 1;
+
+  if (optind == argc)
+    exit_status |= tail_file ("-", number);
+
+  for (; optind < argc; ++optind)
+    exit_status |= tail_file (argv[optind], number);
+
+  if (have_read_stdin && close (0) < 0)
+    error (1, errno, "-");
+  if (close (1) < 0)
+    error (1, errno, "write error");
+  exit (exit_status);
+}
+
+/* Display the last NUMBER units of file FILENAME.
+   "-" for FILENAME means the standard input.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+tail_file (filename, number)
+     char *filename;
+     long number;
+{
+  int fd;
+
+  if (!strcmp (filename, "-"))
+    {
+      have_read_stdin = 1;
+      filename = "standard input";
+      if (print_headers)
+        write_header (filename);
+      /* Descriptor 0 is not closed here; main closes it once at the end. */
+      return tail (filename, 0, number);
+    }
+  else
+    {
+      fd = open (filename, O_RDONLY);
+      if (fd >= 0)
+        {
+          int errors;
+
+          if (print_headers)
+            write_header (filename);
+          errors = tail (filename, fd, number);
+          if (close (fd) == 0)
+            return errors;
+        }
+      /* Reached when either the open or the close failed. */
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+}
+
+/* Write a `==> FILENAME <==' banner to standard output. Every banner
+   after the first one is preceded by an empty line. */
+
+void
+write_header (filename)
+     char *filename;
+{
+  static int first_file = 1;
+
+  if (first_file)
+    {
+      xwrite (1, "==> ", 4);
+      first_file = 0;
+    }
+  else
+    xwrite (1, "\n==> ", 5);
+  xwrite (1, filename, strlen (filename));
+  xwrite (1, " <==\n", 5);
+}
+
+/* Display the last NUMBER units of file FILENAME, open for reading
+   in FD.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+tail (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  /* A nonzero unit_size means NUMBER is a byte count. */
+  if (unit_size)
+    return tail_bytes (filename, fd, number);
+  else
+    return tail_lines (filename, fd, number);
+}
+
+/* Display the last part of file FILENAME, open for reading in FD,
+   using NUMBER characters.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+tail_bytes (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  struct stat st;
+  int regular;
+
+  /* The file type comes from fstat rather than from an ESPIPE failure of
+     lseek, because on some special files lseek neither works nor reports
+     an error. */
+  if (fstat (fd, &st) != 0)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+  regular = S_ISREG (st.st_mode);
+
+  /* Counting from the end of something that is not a regular file: the
+     input has to be buffered. */
+  if (!from_start && !regular)
+    return pipe_bytes (filename, fd, number);
+
+  if (from_start)
+    {
+      /* Skip NUMBER bytes, by seeking when possible, else by reading. */
+      if (regular)
+        lseek (fd, number, SEEK_SET);
+      else if (start_bytes (filename, fd, number) != 0)
+        return 1;
+    }
+  else if (lseek (fd, 0L, SEEK_END) <= number)
+    /* The file holds no more than NUMBER bytes: show all of it. */
+    lseek (fd, 0L, SEEK_SET);
+  else
+    /* The file is longer than NUMBER bytes: back up from its end. */
+    lseek (fd, -number, SEEK_END);
+
+  dump_remainder (filename, fd);
+  return 0;
+}
+
+/* Display the last part of file FILENAME, open for reading on FD,
+   using NUMBER lines.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+tail_lines (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  struct stat st;
+
+  if (fstat (fd, &st) != 0)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  if (from_start)
+    {
+      /* Read past the first NUMBER lines. */
+      if (start_lines (filename, fd, number) != 0)
+        return 1;
+    }
+  else if (!S_ISREG (st.st_mode))
+    return pipe_lines (filename, fd, number);
+  else
+    {
+      /* Regular file: scan backward from its end, unless it is empty. */
+      long size = lseek (fd, 0L, SEEK_END);
+
+      if (size != 0 && file_lines (filename, fd, number, size) != 0)
+        return 1;
+    }
+
+  dump_remainder (filename, fd);
+  return 0;
+}
+
+/* Print the last NUMBER lines from the end of file FD.
+   Go backward through the file, reading `BUFSIZE' bytes at a time (except
+   probably the first), until we hit the start of the file or have
+   read NUMBER newlines.
+   POS starts out as the length of the file (the offset of the last
+   byte of the file + 1).
+   Text found after the located newline in the bufferfull being scanned
+   is written here; nothing else is written by this function.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+file_lines (filename, fd, number, pos)
+     char *filename;
+     int fd;
+     long number;
+     long pos;
+{
+  char buffer[BUFSIZE];
+  int bytes_read;
+  int i;                        /* Index into `buffer' for scanning. */
+
+  if (number == 0)
+    return 0;
+
+  /* Set `bytes_read' to the size of the last, probably partial, buffer;
+     0 < `bytes_read' <= `BUFSIZE'. */
+  bytes_read = pos % BUFSIZE;
+  if (bytes_read == 0)
+    bytes_read = BUFSIZE;
+  /* Make `pos' a multiple of `BUFSIZE' (0 if the file is short), so that all
+     reads will be on block boundaries, which might increase efficiency. */
+  pos -= bytes_read;
+  lseek (fd, pos, SEEK_SET);
+  bytes_read = read (fd, buffer, bytes_read);
+  if (bytes_read == -1)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  /* Count the incomplete line on files that don't end with a newline. */
+  if (bytes_read && buffer[bytes_read - 1] != '\n')
+    --number;
+
+  do
+    {
+      /* Scan backward, counting the newlines in this bufferfull. */
+      for (i = bytes_read - 1; i >= 0; i--)
+        {
+          /* Have we counted the requested number of newlines yet? */
+          if (buffer[i] == '\n' && number-- == 0)
+            {
+              /* If this newline wasn't the last character in the buffer,
+                 print the text after it. */
+              if (i != bytes_read - 1)
+                xwrite (1, &buffer[i + 1], bytes_read - (i + 1));
+              /* The read above left the file offset just past this
+                 bufferfull. */
+              return 0;
+            }
+        }
+      /* Not enough newlines in that bufferfull. */
+      if (pos == 0)
+        {
+          /* Not enough lines in the file; print the entire file.
+             Only the rewind happens here; no data is written. */
+          lseek (fd, 0L, SEEK_SET);
+          return 0;
+        }
+      /* Step back one full block; `pos' stays a multiple of `BUFSIZE'. */
+      pos -= BUFSIZE;
+      lseek (fd, pos, SEEK_SET);
+    }
+  while ((bytes_read = read (fd, buffer, BUFSIZE)) > 0);
+  if (bytes_read == -1)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+  return 0;
+}
+
+/* Print the last NUMBER lines from the end of the standard input,
+   open for reading as pipe FD.
+   Buffer the text as a linked list of LBUFFERs, adding them as needed.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+pipe_lines (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  struct linebuffer
+  {
+    int nbytes, nlines;
+    char buffer[BUFSIZE];
+    struct linebuffer *next;
+  };
+  typedef struct linebuffer LBUFFER;
+  LBUFFER *first, *last, *tmp;
+  int i;                        /* Index into buffers. */
+  int total_lines = 0;          /* Total number of newlines in all buffers. */
+  int errors = 0;
+
+  first = last = (LBUFFER *) xmalloc (sizeof (LBUFFER));
+  first->nbytes = first->nlines = 0;
+  first->next = NULL;
+  tmp = (LBUFFER *) xmalloc (sizeof (LBUFFER));
+
+  /* Input is always read into a fresh buffer. */
+  while ((tmp->nbytes = read (fd, tmp->buffer, BUFSIZE)) > 0)
+    {
+      tmp->nlines = 0;
+      tmp->next = NULL;
+
+      /* Count the number of newlines just read. */
+      for (i = 0; i < tmp->nbytes; i++)
+        if (tmp->buffer[i] == '\n')
+          ++tmp->nlines;
+      total_lines += tmp->nlines;
+
+      /* If there is enough room in the last buffer read, just append the new
+         one to it. This is because when reading from a pipe, `nbytes' can
+         often be very small. */
+      if (tmp->nbytes + last->nbytes < BUFSIZE)
+        {
+          bcopy (tmp->buffer, &last->buffer[last->nbytes], tmp->nbytes);
+          last->nbytes += tmp->nbytes;
+          last->nlines += tmp->nlines;
+        }
+      else
+        {
+          /* If there's not enough room, link the new buffer onto the end of
+             the list, then either free up the oldest buffer for the next
+             read if that would leave enough lines, or else malloc a new one.
+             Some compaction mechanism is possible but probably not
+             worthwhile. */
+          last = last->next = tmp;
+          if (total_lines - first->nlines > number)
+            {
+              tmp = first;
+              total_lines -= first->nlines;
+              first = first->next;
+            }
+          else
+            tmp = (LBUFFER *) xmalloc (sizeof (LBUFFER));
+        }
+    }
+  if (tmp->nbytes == -1)
+    {
+      error (0, errno, "%s", filename);
+      errors = 1;
+      free ((char *) tmp);
+      goto free_lbuffers;
+    }
+
+  free ((char *) tmp);
+
+  /* Nothing to print when no lines were asked for; skipping the search
+     below also prevents a core dump when the pipe contains no newlines.
+     Likewise stop if no data was read at all: `last' is then still the
+     empty initial buffer, and the test below would look at
+     `last->buffer[-1]'. */
+  if (number == 0 || last->nbytes == 0)
+    goto free_lbuffers;
+
+  /* Count the incomplete line on files that don't end with a newline. */
+  if (last->buffer[last->nbytes - 1] != '\n')
+    {
+      ++last->nlines;
+      ++total_lines;
+    }
+
+  /* Run through the list, printing lines. First, skip over unneeded
+     buffers. */
+  for (tmp = first; total_lines - tmp->nlines > number; tmp = tmp->next)
+    total_lines -= tmp->nlines;
+
+  /* Find the correct beginning, then print the rest of the file. */
+  if (total_lines > number)
+    {
+      char *cp;
+
+      /* Skip `total_lines' - `number' newlines. We made sure that
+         `total_lines' - `number' <= `tmp->nlines'. */
+      cp = tmp->buffer;
+      for (i = total_lines - number; i; --i)
+        while (*cp++ != '\n')
+          /* Do nothing. */ ;
+      i = cp - tmp->buffer;
+    }
+  else
+    i = 0;
+  xwrite (1, &tmp->buffer[i], tmp->nbytes - i);
+
+  for (tmp = tmp->next; tmp; tmp = tmp->next)
+    xwrite (1, tmp->buffer, tmp->nbytes);
+
+free_lbuffers:
+  while (first)
+    {
+      tmp = first->next;
+      free ((char *) first);
+      first = tmp;
+    }
+  return errors;
+}
+
+/* Print the last NUMBER characters from the end of pipe FD.
+   This is a stripped down version of pipe_lines.
+   Return 0 if successful, 1 if an error occurred. */
+
+int
+pipe_bytes (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  struct charbuffer
+  {
+    int nbytes;
+    char buffer[BUFSIZE];
+    struct charbuffer *next;
+  };
+  typedef struct charbuffer CBUFFER;
+  CBUFFER *first, *last, *tmp;
+  int i;                        /* Index into buffers. */
+  int total_bytes = 0;          /* Total characters in all buffers. */
+  int errors = 0;
+
+  first = last = (CBUFFER *) xmalloc (sizeof (CBUFFER));
+  first->nbytes = 0;
+  first->next = NULL;
+  tmp = (CBUFFER *) xmalloc (sizeof (CBUFFER));
+
+  /* Input is always read into a fresh buffer. */
+  while ((tmp->nbytes = read (fd, tmp->buffer, BUFSIZE)) > 0)
+    {
+      tmp->next = NULL;
+
+      total_bytes += tmp->nbytes;
+      /* If there is enough room in the last buffer read, just append the new
+         one to it. This is because when reading from a pipe, `nbytes' can
+         often be very small. */
+      if (tmp->nbytes + last->nbytes < BUFSIZE)
+        {
+          bcopy (tmp->buffer, &last->buffer[last->nbytes], tmp->nbytes);
+          last->nbytes += tmp->nbytes;
+        }
+      else
+        {
+          /* If there's not enough room, link the new buffer onto the end of
+             the list, then either free up the oldest buffer for the next
+             read if that would leave enough characters, or else malloc a new
+             one. Some compaction mechanism is possible but probably not
+             worthwhile. */
+          last = last->next = tmp;
+          if (total_bytes - first->nbytes > number)
+            {
+              tmp = first;
+              total_bytes -= first->nbytes;
+              first = first->next;
+            }
+          else
+            {
+              tmp = (CBUFFER *) xmalloc (sizeof (CBUFFER));
+            }
+        }
+    }
+  if (tmp->nbytes == -1)
+    {
+      error (0, errno, "%s", filename);
+      errors = 1;
+      free ((char *) tmp);
+      goto free_cbuffers;
+    }
+
+  free ((char *) tmp);
+
+  /* Run through the list, printing characters. First, skip over unneeded
+     buffers. */
+  for (tmp = first; total_bytes - tmp->nbytes > number; tmp = tmp->next)
+    total_bytes -= tmp->nbytes;
+
+  /* Find the correct beginning, then print the rest of the file.
+     We made sure that `total_bytes' - `number' <= `tmp->nbytes'. */
+  if (total_bytes > number)
+    i = total_bytes - number;
+  else
+    i = 0;
+  xwrite (1, &tmp->buffer[i], tmp->nbytes - i);
+
+  for (tmp = tmp->next; tmp; tmp = tmp->next)
+    xwrite (1, tmp->buffer, tmp->nbytes);
+
+free_cbuffers:
+  while (first)
+    {
+      tmp = first->next;
+      free ((char *) first);
+      first = tmp;
+    }
+  return errors;
+}
+
+/* Skip NUMBER characters from the start of pipe FD, and print
+   any extra characters that were read beyond that.
+   Return 1 on error, 0 if ok.
*/
+
+int
+start_bytes (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  char chunk[BUFSIZE];
+  int got = 0;
+
+  /* Consume input until NUMBER bytes are used up or the input ends. */
+  for (;;)
+    {
+      if (number <= 0)
+        break;
+      got = read (fd, chunk, BUFSIZE);
+      if (got <= 0)
+        break;
+      number -= got;
+    }
+
+  if (got == -1)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  /* A negative NUMBER is how far the final read went past the skip point;
+     those bytes sit at the end of `chunk' and belong to the output. */
+  if (number < 0)
+    xwrite (1, &chunk[got + number], -number);
+  return 0;
+}
+
+/* Skip NUMBER lines at the start of file or pipe FD, and print
+   any extra characters that were read beyond that.
+   Return 1 on error, 0 if ok. */
+
+int
+start_lines (filename, fd, number)
+     char *filename;
+     int fd;
+     long number;
+{
+  char chunk[BUFSIZE];
+  int got = 0;
+  int used = 0;                 /* Bytes of `chunk' already skipped. */
+
+  while (number != 0)
+    {
+      got = read (fd, chunk, BUFSIZE);
+      if (got <= 0)
+        break;
+
+      /* Walk this bufferfull, stopping just after the newline that ends
+         the last line to skip. */
+      for (used = 0; used < got;)
+        {
+          if (chunk[used++] != '\n')
+            continue;
+          if (--number == 0)
+            break;
+        }
+    }
+
+  if (got == -1)
+    {
+      error (0, errno, "%s", filename);
+      return 1;
+    }
+
+  /* Whatever follows the skipped lines in the last bufferfull is output. */
+  if (used < got)
+    xwrite (1, &chunk[used], got - used);
+  return 0;
+}
+
+/* Display file FILENAME from the current position in FD
+   to the end. If `forever' is nonzero, keep reading from the
+   end of the file until killed.
*/
+
+void
+dump_remainder (filename, fd)
+     char *filename;
+     int fd;
+{
+  char buffer[BUFSIZE];
+  int bytes_read;
+
+  do
+    {
+      /* Copy everything currently available to standard output. */
+      while ((bytes_read = read (fd, buffer, BUFSIZE)) > 0)
+        xwrite (1, buffer, bytes_read);
+      if (bytes_read == -1)
+        error (1, errno, "%s", filename);
+      /* When following, wait a second and then look for more data. */
+      if (forever)
+        sleep (1);
+    }
+  while (forever);
+}
+
+/* If STR ends in one of the multiplier letters `b', `k' or `m', set
+   `unit_size' to 512, 1024 or 1048576 respectively and remove the letter
+   from STR in place. Otherwise leave both untouched. */
+
+void
+parse_unit (str)
+     char *str;
+{
+  int arglen = strlen (str);
+
+  if (arglen == 0)
+    return;
+
+  switch (str[arglen - 1])
+    {
+    case 'b':
+      unit_size = 512;
+      str[arglen - 1] = '\0';
+      break;
+    case 'k':
+      unit_size = 1024;
+      str[arglen - 1] = '\0';
+      break;
+    case 'm':
+      unit_size = 1048576;
+      str[arglen - 1] = '\0';
+      break;
+    }
+}
+
+/* Convert STR, a string of ASCII digits, into an unsigned integer.
+   Return -1 if STR does not represent a valid unsigned integer: that is,
+   if it is empty, contains a non-digit, or names a value too large to be
+   returned as a nonnegative long. */
+
+long
+atou (str)
+     char *str;
+{
+  /* Largest value that still fits in the (signed) return type. */
+  unsigned long limit = (unsigned long) -1 >> 1;
+  unsigned long value = 0;
+
+  /* An empty string (for instance a bare multiplier letter after
+     parse_unit stripped it) is not a number. */
+  if (*str == '\0')
+    return -1;
+
+  for (; ISDIGIT (*str); ++str)
+    {
+      unsigned int digit = *str - '0';
+
+      /* Refuse to wrap around instead of returning a bogus count. */
+      if (value > (limit - digit) / 10)
+        return -1;
+      value = value * 10 + digit;
+    }
+  return *str ? -1 : (long) value;
+}
+
+/* Print a usage summary on standard error and exit with status 1. */
+
+void
+usage ()
+{
+  fprintf (stderr, "\
+Usage: %s [-c [+]N[bkm]] [-n [+]N] [-fqv] [--bytes=[+]N[bkm]] [--lines=[+]N]\n\
+ [--follow] [--quiet] [--silent] [--verbose] [file...]\n\
+ %s [{-,+}Nbcfklmqv] [file...]\n", program_name, program_name);
+  exit (1);
+}
diff --git a/src/tr.c b/src/tr.c
new file mode 100644
index 0000000..bd12f38
--- /dev/null
+++ b/src/tr.c
@@ -0,0 +1,1813 @@
+/* tr -- a filter to translate characters
+   Copyright (C) 1991 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Jim Meyering. */ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#ifndef isgraph +#define isgraph(c) (isprint (c) && !isspace (c)) +#endif +#include +#include +#include +#include +#include "getopt.h" +#include "system.h" + +#ifndef LONG_MAX +#define LONG_MAX 0x7FFFFFFF +#endif + +#ifndef UCHAR_MAX +#define UCHAR_MAX 0xFF +#endif + +#define N_CHARS (UCHAR_MAX + 1) + +/* A pointer to a function that returns an int. */ +typedef int (*PFI) (); + +/* Convert from character C to its index in the collating + sequence array. Just cast to an unsigned int to avoid + problems with sign-extension. */ +#define ORD(c) (unsigned int)(c) + +/* The inverse of ORD. */ +#define CHR(i) (unsigned char)(i) + +/* The value for Spec_list->state that indicates to + get_next that it should initialize the tail pointer. + Its value doesn't matter as long as it can't be + confused with a valid character code. */ +#define BEGIN_STATE (2 * N_CHARS) + +/* The value for Spec_list->state that indicates to + get_next that the element pointed to by Spec_list->tail is + being considered for the first time on this pass through the + list -- it indicates that get_next should make any necessary + initializations. */ +#define NEW_ELEMENT (BEGIN_STATE + 1) + +/* A value distinct from any character that may have been stored in a + buffer as the result of a block-read in the function squeeze_filter. */ +#define NOT_A_CHAR (unsigned int)(-1) + +/* The following (but not CC_NO_CLASS) are indices into the array of + valid character class strings. 
*/ +enum Char_class +{ + CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3, + CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7, + CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11, + CC_NO_CLASS = 9999 +}; + +/* Character class to which a character (returned by get_next) belonged; + but it is set only if the construct from which the character was obtained + was one of the character classes [:upper:] or [:lower:]. The value + is used only when translating and then, only to make sure that upper + and lower class constructs have the same relative positions in string1 + and string2. */ +enum Upper_Lower_class +{ + UL_LOWER = 0, + UL_UPPER = 1, + UL_NONE = 2 +}; + +/* A shortcut to ensure that when constructing the translation array, + one of the values returned by paired calls to get_next (from s1 and s2) is + from [:upper:] and the other is from [:lower:], or neither is + from upper or lower. In fact, no other character classes are allowed + when translating, but that condition is tested elsewhere. This array + is indexed by values of type enum Upper_Lower_class. */ +static int class_ok[3][3] = +{ + {0, 1, 0}, + {1, 0, 0}, + {0, 0, 1} +}; + +/* The type of a List_element. See build_spec_list for more details. */ +enum Range_element_type +{ + RE_NO_TYPE = 0, + RE_NORMAL_CHAR, + RE_RANGE, + RE_CHAR_CLASS, + RE_EQUIV_CLASS, + RE_REPEATED_CHAR +}; + +/* One construct in one of tr's argument strings. + For example, consider the POSIX version of the + classic tr command: + tr -cs 'a-zA-Z_' '[\n*]' + String1 has 3 constructs, two of which are ranges (a-z and A-Z), + and a single normal character, `_'. String2 has one construct. 
*/ +struct List_element +{ + enum Range_element_type type; + struct List_element *next; + union + { + int normal_char; + struct /* unnamed */ + { + unsigned int first_char; + unsigned int last_char; + } range; + enum Char_class char_class; + int equiv_code; + struct /* unnamed */ + { + unsigned int the_repeated_char; + long repeat_count; + } repeated_char; + } u; +}; + +/* Each of tr's argument strings is parsed into a form that is easier + to work with: a linked list of constructs (struct List_element). + Each Spec_list structure also encapsulates various attributes of + the corresponding argument string. The attributes are used mainly + to verify that the strings are legal in the context of any options + specified (like -s, -d, or -c). The main exception is the member + `tail', which is first used to construct the list. After construction, + it is used by get_next to save its state when traversing the list. + The member `state' serves a similar function. */ +struct Spec_list +{ + /* Points to the head of the list of range elements. + The first struct is a dummy; its members are never used. */ + struct List_element *head; + + /* When appending, points to the last element. When traversing via + get_next(), points to the element to process next. Setting + Spec_list.state to the value BEGIN_STATE before calling get_next + signals get_next to initialize tail to point to head->next. */ + struct List_element *tail; + + /* Used to save state between calls to get_next(). */ + unsigned int state; + + /* Length, in the sense that length('a-z[:digit:]123abc') + is 42 ( = 26 + 10 + 6). */ + int length; + + /* The number of [c*] and [c*0] constructs that appear in this spec. */ + int n_indefinite_repeats; + + /* Non-zero if this spec contains at least one equivalence + class construct e.g. [=c=]. */ + int has_equiv_class; + + /* Non-zero if this spec contains at least one of [:upper:] or + [:lower:] class constructs. 
*/ + int has_upper_or_lower; + + /* Non-zero if this spec contains at least one of the character class + constructs (all but upper and lower) that aren't allowed in s2. */ + int has_restricted_char_class; +}; + +char *xmalloc (); +char *stpcpy (); +void error (); + +/* The name by which this program was run. */ +char *program_name; + +/* When non-zero, each sequence in the input of a repeated character + (call it c) is replaced (in the output) by a single occurrence of c + for every c in the squeeze set. */ +static int squeeze_repeats = 0; + +/* When non-zero, removes characters in the delete set from input. */ +static int delete = 0; + +/* Use the complement of set1 in place of set1. */ +static int complement = 0; + +/* When non-zero, this flag causes GNU tr to provide strict + compliance with POSIX draft 1003.2.11.2. The POSIX spec + says that when -d is used without -s, string2 (if present) + must be ignored. Silently ignoring arguments is a bad idea. + The default GNU behavior is to give a usage message and exit. + Additionally, when this flag is non-zero, tr prints warnings + on stderr if it is being used in a manner that is not portable. + Applicable warnings are given by default, but are suppressed + if the environment variable `POSIXLY_CORRECT' is set, since + being POSIX conformant means we can't issue such messages. + Warnings on the following topics are suppressed when this + variable is non-zero: + 1. Ambiguous octal escapes. */ +static int posix_pedantic; + +/* When tr is performing translation and string1 is longer than string2, + POSIX says that the result is undefined. That gives the implementor + of a POSIX conforming version of tr two reasonable choices for the + semantics of this case. + + * The BSD tr pads string2 to the length of string1 by + repeating the last character in string2. + + * System V tr ignores characters in string1 that have no + corresponding character in string2. 
That is, string1 is effectively + truncated to the length of string2. + + When non-zero, this flag causes GNU tr to imitate the behavior + of System V tr when translating with string1 longer than string2. + The default is to emulate BSD tr. This flag is ignored in modes where + no translation is performed. Emulating the System V tr + in this exceptional case causes the relatively common BSD idiom: + + tr -cs A-Za-z0-9 '\012' + + to break (it would convert only zero bytes, rather than all + non-alphanumerics, to newlines). + + WARNING: This switch does not provide general BSD or System V + compatibility. For example, it doesn't disable the interpretation + of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by + some unfortunate coincidence you use such constructs in scripts + expecting to use some other version of tr, the scripts will break. */ +static int truncate_set1 = 0; + +/* An alias for (!delete && non_option_args == 2). + It is set in main and used there and in validate(). */ +static int translating; + +#ifndef BUFSIZ +#define BUFSIZ 8192 +#endif + +#define IO_BUF_SIZE BUFSIZ +static unsigned char io_buf[IO_BUF_SIZE]; + +char *char_class_name[] = +{ + "alnum", "alpha", "blank", "cntrl", "digit", "graph", + "lower", "print", "punct", "space", "upper", "xdigit" +}; +#define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0])) + +typedef char SET_TYPE; + +/* Array of boolean values. A character `c' is a member of the + squeeze set if and only if in_squeeze_set[c] is true. The squeeze + set is defined by the last (possibly, the only) string argument + on the command line when the squeeze option is given. */ +static SET_TYPE in_squeeze_set[N_CHARS]; + +/* Array of boolean values. A character `c' is a member of the + delete set if and only if in_delete_set[c] is true. The delete + set is defined by the first (or only) string argument on the + command line when the delete option is given. 
*/
+static SET_TYPE in_delete_set[N_CHARS];
+
+/* Array of character values defining the translation (if any) that
+   tr is to perform. Translation is performed only when there are
+   two specification strings and the delete switch is not given. */
+static char xlate[N_CHARS];
+
+static struct option long_options[] =
+{
+  {"complement", 0, NULL, 'c'},
+  {"delete", 0, NULL, 'd'},
+  {"squeeze-repeats", 0, NULL, 's'},
+  {"truncate-set1", 0, NULL, 't'},
+  {NULL, 0, NULL, 0}
+};
+
+
+/* Print a usage summary on standard error and exit with status 2. */
+
+static void
+usage ()
+{
+  fprintf (stderr, "\
+Usage: %s [-cdst] [--complement] [--delete] [--squeeze-repeats]\n\
+ [--truncate-set1] string1 [string2]\n",
+           program_name);
+  exit (2);
+}
+
+/* Return non-zero if the character C is a member of the
+   equivalence class containing the character EQUIV_CLASS.
+   As written, a class contains only the character that names it. */
+
+static int
+is_equiv_class_member (equiv_class, c)
+     unsigned int equiv_class;
+     unsigned int c;
+{
+  return (equiv_class == c);
+}
+
+/* Return non-zero if the character C is a member of the
+   character class CHAR_CLASS.
+   CHAR_CLASS must be one of the twelve real classes; CC_NO_CLASS, or any
+   other value, aborts the program. */
+
+static int
+is_char_class_member (char_class, c)
+     enum Char_class char_class;
+     unsigned int c;
+{
+  switch (char_class)
+    {
+    case CC_ALNUM:
+      return isalnum (c);
+    case CC_ALPHA:
+      return isalpha (c);
+    case CC_BLANK:
+      return isblank (c);
+    case CC_CNTRL:
+      return iscntrl (c);
+    case CC_DIGIT:
+      return isdigit (c);
+    case CC_GRAPH:
+      return isgraph (c);
+    case CC_LOWER:
+      return islower (c);
+    case CC_PRINT:
+      return isprint (c);
+    case CC_PUNCT:
+      return ispunct (c);
+    case CC_SPACE:
+      return isspace (c);
+    case CC_UPPER:
+      return isupper (c);
+    case CC_XDIGIT:
+      return isxdigit (c);
+    case CC_NO_CLASS:
+    default:
+      break;
+    }
+
+  /* Not a real character class: this is a bug in the caller. Abort here
+     rather than falling off the end of the function without a value. */
+  abort ();
+  return 0;
+}
+
+/* Perform the first pass over each range-spec argument S,
+   converting all \c and \ddd escapes to their one-byte representations.
+   The conversion is done in-place, so S must point to writable
+   storage.
If an illegal quote sequence is found, an error message is + printed and the function returns non-zero. Otherwise the length of + the resulting string is returned through LEN and the function returns 0. + The resulting array of characters may contain zero-bytes; however, + on input, S is assumed to be null-terminated, and hence + cannot contain actual (non-escaped) zero bytes. */ + +static int +unquote (s, len) + unsigned char *s; + int *len; +{ + int i, j; + + j = 0; + for (i = 0; s[i]; i++) + { + switch (s[i]) + { + int c; + case '\\': + switch (s[i + 1]) + { + int oct_digit; + case '\\': + c = '\\'; + break; + case 'a': + c = '\007'; + break; + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + c = s[i + 1] - '0'; + oct_digit = s[i + 2] - '0'; + if (0 <= oct_digit && oct_digit <= 7) + { + c = 8 * c + oct_digit; + ++i; + oct_digit = s[i + 2] - '0'; + if (0 <= oct_digit && oct_digit <= 7) + { + if (8 * c + oct_digit < N_CHARS) + { + c = 8 * c + oct_digit; + ++i; + } + else if (!posix_pedantic) + { + /* Any octal number larger than 0377 won't + fit in 8 bits. So we stop when adding the + next digit would put us over the limit and + give a warning about the ambiguity. POSIX + isn't clear on this, but one person has said + that in his interpretation, POSIX says tr + can't even give a warning. 
*/ + error (0, 0, "warning: the ambiguous octal escape \ +\\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'", + s[i], s[i + 1], s[i + 2], + s[i], s[i + 1], s[i + 2]); + } + } + } + break; + case '\0': + error (0, 0, "invalid backslash escape at end of string"); + return 1; + break; + default: + error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]); + return 1; + break; + } + ++i; + s[j++] = c; + break; + default: + s[j++] = s[i]; + break; + } + } + *len = j; + return 0; +} + +/* If CLASS_STR is a valid character class string, return its index + in the global char_class_name array. Otherwise, return CC_NO_CLASS. */ + +static enum Char_class +look_up_char_class (class_str) + unsigned char *class_str; +{ + unsigned int i; + + for (i = 0; i < N_CHAR_CLASSES; i++) + if (strcmp (class_str, char_class_name[i]) == 0) + return (enum Char_class) i; + return CC_NO_CLASS; +} + +/* Return a newly allocated string with a printable version of C. + This function is used solely for formatting error messages. */ + +static char * +make_printable_char (c) + unsigned int c; +{ + char *buf = xmalloc (5); + + assert (c < N_CHARS); + if (isprint (c)) + { + buf[0] = c; + buf[1] = '\0'; + } + else + { + sprintf (buf, "\\%03o", c); + } + return buf; +} + +/* Return a newly allocated copy of S which is suitable for printing. + LEN is the number of characters in S. Most non-printing + (isprint) characters are represented by a backslash followed by + 3 octal digits. However, the characters represented by \c escapes + where c is one of [abfnrtv] are represented by their 2-character \c + sequences. This function is used solely for printing error messages. */ + +static char * +make_printable_str (s, len) + unsigned char *s; + int len; +{ + /* Worst case is that every character expands to a backslash + followed by a 3-character octal escape sequence. 
*/ + char *printable_buf = xmalloc (4 * len + 1); + char *p = printable_buf; + int i; + + for (i = 0; i < len; i++) + { + char buf[5]; + char *tmp = NULL; + + switch (s[i]) + { + case '\\': + tmp = "\\"; + break; + case '\007': + tmp = "\\a"; + break; + case '\b': + tmp = "\\b"; + break; + case '\f': + tmp = "\\f"; + break; + case '\n': + tmp = "\\n"; + break; + case '\r': + tmp = "\\r"; + break; + case '\t': + tmp = "\\t"; + break; + case '\v': + tmp = "\\v"; + break; + default: + if (isprint (s[i])) + { + buf[0] = s[i]; + buf[1] = '\0'; + } + else + sprintf (buf, "\\%03o", s[i]); + tmp = buf; + break; + } + p = stpcpy (p, tmp); + } + return printable_buf; +} + +/* Append a newly allocated structure representing a + character C to the specification list LIST. */ + +static void +append_normal_char (list, c) + struct Spec_list *list; + unsigned int c; +{ + struct List_element *new; + + new = (struct List_element *) xmalloc (sizeof (struct List_element)); + new->next = NULL; + new->type = RE_NORMAL_CHAR; + new->u.normal_char = c; + assert (list->tail); + list->tail->next = new; + list->tail = new; +} + +/* Append a newly allocated structure representing the range + of characters from FIRST to LAST to the specification list LIST. + Return non-zero if LAST precedes FIRST in the collating sequence, + zero otherwise. This means that '[c-c]' is acceptable. 
*/ + +static int +append_range (list, first, last) + struct Spec_list *list; + unsigned int first; + unsigned int last; +{ + struct List_element *new; + + if (ORD (first) > ORD (last)) + { + char *tmp1 = make_printable_char (first); + char *tmp2 = make_printable_char (last); + + error (0, 0, + "range-endpoints of `%s-%s' are in reverse collating sequence order", + tmp1, tmp2); + free (tmp1); + free (tmp2); + return 1; + } + new = (struct List_element *) xmalloc (sizeof (struct List_element)); + new->next = NULL; + new->type = RE_RANGE; + new->u.range.first_char = first; + new->u.range.last_char = last; + assert (list->tail); + list->tail->next = new; + list->tail = new; + return 0; +} + +/* If CHAR_CLASS_STR is a valid character class string, append a + newly allocated structure representing that character class to the end + of the specification list LIST and return 0. If CHAR_CLASS_STR is not + a valid string, give an error message and return non-zero. */ + +static int +append_char_class (list, char_class_str, len) + struct Spec_list *list; + unsigned char *char_class_str; + int len; +{ + enum Char_class char_class; + struct List_element *new; + + char_class = look_up_char_class (char_class_str); + if (char_class == CC_NO_CLASS) + { + char *tmp = make_printable_str (char_class_str, len); + + error (0, 0, "invalid character class `%s'", tmp); + free (tmp); + return 1; + } + new = (struct List_element *) xmalloc (sizeof (struct List_element)); + new->next = NULL; + new->type = RE_CHAR_CLASS; + new->u.char_class = char_class; + assert (list->tail); + list->tail->next = new; + list->tail = new; + return 0; +} + +/* Append a newly allocated structure representing a [c*n] + repeated character construct, to the specification list LIST. + THE_CHAR is the single character to be repeated, and REPEAT_COUNT + is non-negative repeat count. 
*/ + +static void +append_repeated_char (list, the_char, repeat_count) + struct Spec_list *list; + unsigned int the_char; + long int repeat_count; +{ + struct List_element *new; + + new = (struct List_element *) xmalloc (sizeof (struct List_element)); + new->next = NULL; + new->type = RE_REPEATED_CHAR; + new->u.repeated_char.the_repeated_char = the_char; + new->u.repeated_char.repeat_count = repeat_count; + assert (list->tail); + list->tail->next = new; + list->tail = new; +} + +/* Given a string, EQUIV_CLASS_STR, from a [=str=] context and + the length of that string, LEN, if LEN is exactly one, append + a newly allocated structure representing the specified + equivalence class to the specification list, LIST and return zero. + If LEN is not 1, issue an error message and return non-zero. */ + +static int +append_equiv_class (list, equiv_class_str, len) + struct Spec_list *list; + unsigned char *equiv_class_str; + int len; +{ + struct List_element *new; + + if (len != 1) + { + char *tmp = make_printable_str (equiv_class_str, len); + + error (0, 0, "%s: equivalence class operand must be a single character", + tmp); + free (tmp); + return 1; + } + new = (struct List_element *) xmalloc (sizeof (struct List_element)); + new->next = NULL; + new->type = RE_EQUIV_CLASS; + new->u.equiv_code = *equiv_class_str; + assert (list->tail); + list->tail->next = new; + list->tail = new; + return 0; +} + +/* Return a newly allocated copy of P[FIRST_IDX..LAST_IDX]. */ + +static unsigned char * +substr (p, first_idx, last_idx) + unsigned char *p; + int first_idx; + int last_idx; +{ + int len = last_idx - first_idx + 1; + unsigned char *tmp = (unsigned char *) xmalloc (len); + + assert (first_idx <= last_idx); + /* We must use bcopy or memcopy rather than strncpy + because `p' may contain zero-bytes. 
*/ + bcopy (p + first_idx, tmp, len); + tmp[len] = '\0'; + return tmp; +} + +/* Search forward starting at START_IDX for the 2-char sequence + (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such + a sequence is found, return the index of the first character, + otherwise return -1. P may contain zero bytes. */ + +static int +find_closing_delim (p, start_idx, p_len, pre_bracket_char) + unsigned char *p; + int start_idx; + int p_len; + unsigned int pre_bracket_char; +{ + int i; + + for (i = start_idx; i < p_len - 1; i++) + if (p[i] == pre_bracket_char && p[i + 1] == ']') + return i; + return -1; +} + +/* Convert a string S with explicit length LEN, possibly + containing embedded zero bytes, to a long integer value. + If the string represents a negative value, a value larger + than LONG_MAX, or if all LEN characters do not represent a + valid integer, return non-zero and do not modify *VAL. + Otherwise, return zero and set *VAL to the converted value. */ + +static int +non_neg_strtol (s, len, val) + unsigned char *s; + int len; + long int *val; +{ + int i; + long sum = 0; + unsigned int base; + + if (len <= 0) + return 1; + if (s[0] == '0') + base = 8; + else if (isdigit (s[0])) + base = 10; + else + return 1; + + for (i = 0; i < len; i++) + { + int c = s[i] - '0'; + + if (c >= base || c < 0) + return 1; + if (i > 8 && sum > (LONG_MAX - c) / base) + return 1; + sum = sum * base + c; + } + *val = sum; + return 0; +} + +/* Parse the bracketed repeat-char syntax. If the P_LEN characters + beginning with P[ START_IDX ] comprise a valid [c*n] construct, + return the character and the repeat count through the arg pointers, + CHAR_TO_REPEAT and N, and then return the index of the closing + bracket as the function value. If the second character following + the opening bracket is not `*' or if no closing bracket can be + found, return -1. 
If a closing bracket is found and the + second char is `*', but the string between the `*' and `]' isn't + empty, an octal number, or a decimal number, print an error message + and return -2. */ + +static int +find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n) + unsigned char *p; + int start_idx; + int p_len; + unsigned int *char_to_repeat; + long int *n; +{ + int i; + + assert (start_idx + 1 < p_len); + if (p[start_idx + 1] != '*') + return -1; + + for (i = start_idx + 2; i < p_len; i++) + { + if (p[i] == ']') + { + unsigned char *digit_str; + int digit_str_len = i - start_idx - 2; + + *char_to_repeat = p[start_idx]; + if (digit_str_len == 0) + { + /* We've matched [c*] -- no explicit repeat count. */ + *n = 0; + return i; + } + + /* Here, we have found [c*s] where s should be a string + of octal or decimal digits. */ + digit_str = &p[start_idx + 2]; + if (non_neg_strtol (digit_str, digit_str_len, n)) + { + char *tmp = make_printable_str (digit_str, digit_str_len); + error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp); + free (tmp); + return -2; + } + return i; + } + } + return -1; /* No bracket found. */ +} + +/* Convert string UNESACPED_STRING (which has been preprocessed to + convert backslash-escape sequences) of length LEN characters into + a linked list of the following 5 types of constructs: + - [:str:] Character class where `str' is one of the 12 valid strings. + - [=c=] Equivalence class where `c' is any single character. + - [c*n] Repeat the single character `c' `n' times. n may be omitted. + However, if `n' is present, it must be a non-negative octal or + decimal integer. + - r-s Range of characters from `r' to `s'. The second endpoint must + not precede the first in the current collating sequence. + - c Any other character is interpreted as itself. 
*/ + +static int +build_spec_list (unescaped_string, len, result) + unsigned char *unescaped_string; + int len; + struct Spec_list *result; +{ + unsigned char *p; + int i; + + p = unescaped_string; + + /* The main for-loop below recognizes the 4 multi-character constructs. + A character that matches (in its context) none of the multi-character + constructs is classified as `normal'. Since all multi-character + constructs have at least 3 characters, any strings of length 2 or + less are composed solely of normal characters. Hence, the index of + the outer for-loop runs only as far as LEN-2. */ + + for (i = 0; i < len - 2;) + { + switch (p[i]) + { + int fall_through; + case '[': + fall_through = 0; + switch (p[i + 1]) + { + int closing_delim_idx; + int closing_bracket_idx; + unsigned int char_to_repeat; + long repeat_count; + case ':': + case '=': + closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]); + if (closing_delim_idx >= 0) + { + int parse_failed; + unsigned char *opnd_str = substr (p, i + 2, closing_delim_idx - 1); + if (p[i + 1] == ':') + parse_failed = append_char_class (result, opnd_str, + (closing_delim_idx - 1) - (i + 2) + 1); + else + parse_failed = append_equiv_class (result, opnd_str, + (closing_delim_idx - 1) - (i + 2) + 1); + free (opnd_str); + + /* Return non-zero if append_*_class reports a problem. */ + if (parse_failed) + return 1; + else + i = closing_delim_idx + 2; + break; + } + /* Else fall through. This could be [:*] or [=*]. */ + default: + /* Determine whether this is a bracketed repeat range + matching the RE \[.\*(dec_or_oct_number)?\]. */ + closing_bracket_idx = find_bracketed_repeat (p, i + 1, + len, &char_to_repeat, &repeat_count); + if (closing_bracket_idx >= 0) + { + append_repeated_char (result, char_to_repeat, repeat_count); + i = closing_bracket_idx + 1; + break; + } + else if (closing_bracket_idx == -1) + { + fall_through = 1; + } + else + /* Found a string that looked like [c*n] but the + numeric part was invalid. 
*/ + return 1; + break; + } + if (!fall_through) + break; + + /* Here if we've tried to match [c*n], [:str:], and [=c=] + and none of them fit. So we still have to consider the + range `[-c' (from `[' to `c'). */ + default: + /* Look ahead one char for ranges like a-z. */ + if (p[i + 1] == '-') + { + if (append_range (result, p[i], p[i + 2])) + return 1; + i += 3; + } + else + { + append_normal_char (result, p[i]); + ++i; + } + break; + } + } + + /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1]. */ + for (; i < len; i++) + append_normal_char (result, p[i]); + + return 0; +} + + +/* Given a Spec_list S (with its saved state implicit in the values + of its members `tail' and `state'), return the next single character + in the expansion of S's constructs. If the last character of S was + returned on the previous call or if S was empty, this function + returns -1. For example, successive calls to get_next where S + represents the spec-string 'a-d[y*3]' will return the sequence + of values a, b, c, d, y, y, y, -1. Finally, if the construct from + which the returned character comes is [:upper:] or [:lower:], the + parameter CLASS is given a value to indicate which it was. Otherwise + CLASS is set to UL_NONE. This value is used only when constructing + the translation table to verify that any occurrences of upper and + lower class constructs in the spec-strings appear in the same relative + positions. 
*/ + +static int +get_next (s, class) + struct Spec_list *s; + enum Upper_Lower_class *class; +{ + struct List_element *p; + int return_val; + int i; + + if (class) + *class = UL_NONE; + + if (s->state == BEGIN_STATE) + { + s->tail = s->head->next; + s->state = NEW_ELEMENT; + } + + p = s->tail; + if (p == NULL) + return -1; + + switch (p->type) + { + case RE_NORMAL_CHAR: + return_val = p->u.normal_char; + s->state = NEW_ELEMENT; + s->tail = p->next; + break; + + case RE_RANGE: + if (s->state == NEW_ELEMENT) + s->state = ORD (p->u.range.first_char); + else + ++(s->state); + return_val = CHR (s->state); + if (s->state == ORD (p->u.range.last_char)) + { + s->tail = p->next; + s->state = NEW_ELEMENT; + } + break; + + case RE_CHAR_CLASS: + if (s->state == NEW_ELEMENT) + { + for (i = 0; i < N_CHARS; i++) + if (is_char_class_member (p->u.char_class, i)) + break; + assert (i < N_CHARS); + s->state = i; + } + assert (is_char_class_member (p->u.char_class, s->state)); + return_val = CHR (s->state); + for (i = s->state + 1; i < N_CHARS; i++) + if (is_char_class_member (p->u.char_class, i)) + break; + if (i < N_CHARS) + s->state = i; + else + { + s->tail = p->next; + s->state = NEW_ELEMENT; + } + if (class) + { + switch (p->u.char_class) + { + case CC_LOWER: + *class = UL_LOWER; + break; + case CC_UPPER: + *class = UL_UPPER; + break; + default: + /* empty */ + break; + } + } + break; + + case RE_EQUIV_CLASS: + /* FIXME: this assumes that each character is alone in its own + equivalence class (which appears to be correct for my + LC_COLLATE. But I don't know of any function that allows + one to determine a character's equivalence class. */ + + return_val = p->u.equiv_code; + s->state = NEW_ELEMENT; + s->tail = p->next; + break; + + case RE_REPEATED_CHAR: + /* Here, a repeat count of n == 0 means don't repeat at all. 
*/ + assert (p->u.repeated_char.repeat_count >= 0); + if (p->u.repeated_char.repeat_count == 0) + { + s->tail = p->next; + s->state = NEW_ELEMENT; + return_val = get_next (s, class); + } + else + { + if (s->state == NEW_ELEMENT) + { + s->state = 0; + } + ++(s->state); + return_val = p->u.repeated_char.the_repeated_char; + if (p->u.repeated_char.repeat_count > 0 + && s->state == p->u.repeated_char.repeat_count) + { + s->tail = p->next; + s->state = NEW_ELEMENT; + } + } + break; + + case RE_NO_TYPE: + abort (); + break; + } + return return_val; +} + +/* This is a minor kludge. This function is called from + get_spec_stats to determine the cardinality of a set derived + from a complemented string. It's a kludge in that some of + the same operations are (duplicated) performed in set_initialize. */ + +static int +card_of_complement (s) + struct Spec_list *s; +{ + int c; + int cardinality = N_CHARS; + SET_TYPE in_set[N_CHARS]; + + bzero (in_set, N_CHARS * sizeof (in_set[0])); + s->state = BEGIN_STATE; + while ((c = get_next (s, NULL)) != -1) + if (!in_set[c]++) + --cardinality; + return cardinality; +} + +/* Gather statistics about the spec-list S in preparation for the tests + in validate that determine the legality of the specs. This function + is called at most twice; once for string1, and again for any string2. + LEN_S1 < 0 indicates that this is the first call and that S represents + string1. When LEN_S1 >= 0, it is the length of the expansion of the + constructs in string1, and we can use its value to resolve any + indefinite repeat construct in S (which represents string2). Hence, + this function has the side-effect that it converts a valid [c*] + construct in string2 to [c*n] where n is large enough (or 0) to give + string2 the same length as string1. For example, with the command + tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would + be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. 
*/ + +static void +get_spec_stats (s, len_s1) + struct Spec_list *s; + int len_s1; +{ + struct List_element *p; + struct List_element *indefinite_repeat_element = NULL; + int len = 0; + + s->n_indefinite_repeats = 0; + s->has_equiv_class = 0; + s->has_restricted_char_class = 0; + s->has_upper_or_lower = 0; + for (p = s->head->next; p; p = p->next) + { + switch (p->type) + { + int i; + case RE_NORMAL_CHAR: + ++len; + break; + + case RE_RANGE: + assert (p->u.range.last_char >= p->u.range.first_char); + len += p->u.range.last_char - p->u.range.first_char + 1; + break; + + case RE_CHAR_CLASS: + for (i = 0; i < N_CHARS; i++) + if (is_char_class_member (p->u.char_class, i)) + ++len; + switch (p->u.char_class) + { + case CC_UPPER: + case CC_LOWER: + s->has_upper_or_lower = 1; + break; + default: + s->has_restricted_char_class = 1; + break; + } + break; + + case RE_EQUIV_CLASS: + for (i = 0; i < N_CHARS; i++) + if (is_equiv_class_member (p->u.equiv_code, i)) + ++len; + s->has_equiv_class = 1; + break; + + case RE_REPEATED_CHAR: + if (p->u.repeated_char.repeat_count > 0) + len += p->u.repeated_char.repeat_count; + else if (p->u.repeated_char.repeat_count == 0) + { + indefinite_repeat_element = p; + ++(s->n_indefinite_repeats); + } + break; + + case RE_NO_TYPE: + assert (0); + break; + } + } + + if (len_s1 >= len && s->n_indefinite_repeats == 1) + { + indefinite_repeat_element->u.repeated_char.repeat_count = len_s1 - len; + len = len_s1; + } + if (complement && len_s1 < 0) + s->length = card_of_complement (s); + else + s->length = len; + return; +} + +static void +spec_init (spec_list) + struct Spec_list *spec_list; +{ + spec_list->head = spec_list->tail = + (struct List_element *) xmalloc (sizeof (struct List_element)); + spec_list->head->next = NULL; +} + +/* This function makes two passes over the argument string S. The first + one converts all \c and \ddd escapes to their one-byte representations. 
+ The second constructs a linked specification list, SPEC_LIST, of the + characters and constructs that comprise the argument string. If either + of these passes detects an error, this function returns non-zero. */ + +static int +parse_str (s, spec_list) + unsigned char *s; + struct Spec_list *spec_list; +{ + int len; + + if (unquote (s, &len)) + return 1; + if (build_spec_list (s, len, spec_list)) + return 1; + return 0; +} + +/* Given two specification lists, S1 and S2, and assuming that + S1->length > S2->length, append a single [c*n] element to S2 where c + is the last character in the expansion of S2 and n is the difference + between the two lengths. + Upon successful completion, S2->length is set to S1->length. The only + way this function can fail to make S2 as long as S1 is when S2 has + zero-length, since in that case, there is no last character to repeat. + So S2->length is required to be at least 1. + + Providing this functionality allows the user to do some pretty + non-BSD (and non-portable) things: For example, the command + tr -cs '[:upper:]0-9' '[:lower:]' + is almost guaranteed to give results that depend on your collating + sequence. */ + +static void +string2_extend (s1, s2) + struct Spec_list *s1; + struct Spec_list *s2; +{ + struct List_element *p; + int char_to_repeat; + int i; + + assert (translating); + assert (s1->length > s2->length); + assert (s2->length > 0); + + p = s2->tail; + switch (p->type) + { + case RE_NORMAL_CHAR: + char_to_repeat = p->u.normal_char; + break; + case RE_RANGE: + char_to_repeat = p->u.range.last_char; + break; + case RE_CHAR_CLASS: + for (i = N_CHARS; i >= 0; i--) + if (is_char_class_member (p->u.char_class, i)) + break; + assert (i >= 0); + char_to_repeat = CHR (i); + break; + + case RE_REPEATED_CHAR: + char_to_repeat = p->u.repeated_char.the_repeated_char; + break; + + case RE_EQUIV_CLASS: + /* This shouldn't happen, because validate exits with an error + if it finds an equiv class in string2 when translating. 
*/ + abort (); + break; + + case RE_NO_TYPE: + abort (); + break; + } + append_repeated_char (s2, char_to_repeat, s1->length - s2->length); + s2->length = s1->length; + return; +} + +/* Die with an error message if S1 and S2 describe strings that + are not valid with the given command line switches. + A side effect of this function is that if a legal [c*] or + [c*0] construct appears in string2, it is converted to [c*n] + with a value for n that makes s2->length == s1->length. By + the same token, if the --truncate-set1 option is not + given, S2 may be extended. */ + +static void +validate (s1, s2) + struct Spec_list *s1; + struct Spec_list *s2; +{ + get_spec_stats (s1, -1); + if (s1->n_indefinite_repeats > 0) + { + error (1, 0, "the [c*] repeat construct may not appear in string1"); + } + + /* FIXME: it isn't clear from the POSIX spec that this is illegal, + but in the spirit of the other restrictions put on translation + with character classes, this seems a logical interpretation. */ + if (complement && s1->has_upper_or_lower) + { + error (1, 0, + "character classes may not be used when translating and complementing"); + } + + if (s2) + { + get_spec_stats (s2, s1->length); + if (s2->has_restricted_char_class) + { + error (1, 0, + "when translating, the only character classes that may appear in\n\ +\tstring2 are `upper' and `lower'"); + } + + if (s2->n_indefinite_repeats > 1) + { + error (1, 0, "only one [c*] repeat construct may appear in string2"); + } + + if (translating) + { + if (s2->has_equiv_class) + { + error (1, 0, + "[=c=] expressions may not appear in string2 when translating"); + } + + if (s1->length > s2->length) + { + if (!truncate_set1) + { + /* string2 must be non-empty unless --truncate-set1 is + given or string1 is empty. 
*/ + + if (s2->length == 0) + error (1, 0, + "when not truncating set1, string2 must be non-empty"); + string2_extend (s1, s2); + } + } + + if (complement && s2->has_upper_or_lower) + error (1, 0, + "character classes may not be used when translating and complementing"); + } + else + /* Not translating. */ + { + if (s2->n_indefinite_repeats > 0) + error (1, 0, + "the [c*] construct may appear in string2 only when translating"); + } + } +} + +/* Read buffers of SIZE bytes via the function READER (if READER is + NULL, read from stdin) until EOF. When non-NULL, READER is either + read_and_delete or read_and_xlate. After each buffer is read, it is + processed and written to stdout. The buffers are processed so that + multiple consecutive occurrences of the same character in the input + stream are replaced by a single occurrence of that character if the + character is in the squeeze set. */ + +static void +squeeze_filter (buf, size, reader) + unsigned char *buf; + long int size; + PFI reader; +{ + unsigned int char_to_squeeze = NOT_A_CHAR; + int i = 0; + int nr = 0; + + for (;;) + { + int begin; + + if (i >= nr) + { + if (reader == NULL) + nr = read (0, (char *) buf, size); + else + nr = (*reader) (buf, size, NULL); + + if (nr < 0) + error (1, errno, "read error"); + if (nr == 0) + break; + i = 0; + } + + begin = i; + + if (char_to_squeeze == NOT_A_CHAR) + { + int out_len; + /* Here, by being a little tricky, we can get a significant + performance increase in most cases when the input is + reasonably large. Since tr will modify the input only + if two consecutive (and identical) input characters are + in the squeeze set, we can step by two through the data + when searching for a character in the squeeze set. This + means there may be a little more work in a few cases and + perhaps twice as much work in the worst cases where most + of the input is removed by squeezing repeats. But most + uses of this functionality seem to remove less than 20-30% + of the input. 
*/ + for (; i < nr && !in_squeeze_set[buf[i]]; i += 2) + ; /* empty */ + + /* There is a special case when i == nr and we've just + skipped a character (the last one in buf) that is in + the squeeze set. */ + if (i == nr && in_squeeze_set[buf[i - 1]]) + --i; + + if (i >= nr) + out_len = nr - begin; + else + { + char_to_squeeze = buf[i]; + /* We're about to output buf[begin..i]. */ + out_len = i - begin + 1; + + /* But since we stepped by 2 in the loop above, + out_len may be one too large. */ + if (i > 0 && buf[i - 1] == char_to_squeeze) + --out_len; + + /* Advance i to the index of first character to be + considered when looking for a char different from + char_to_squeeze. */ + ++i; + } + if (out_len > 0 + && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0) + error (1, errno, "write error"); + } + + if (char_to_squeeze != NOT_A_CHAR) + { + /* Advance i to index of first char != char_to_squeeze + (or to nr if all the rest of the characters in this + buffer are the same as char_to_squeeze). */ + for (; i < nr && buf[i] == char_to_squeeze; i++) + ; /* empty */ + if (i < nr) + char_to_squeeze = NOT_A_CHAR; + /* If (i >= nr) we've squeezed the last character in this buffer. + So now we have to read a new buffer and continue comparing + characters against char_to_squeeze. */ + } + } +} + +/* Read buffers of SIZE bytes from stdin until one is found that + contains at least one character not in the delete set. Store + in the array BUF, all characters from that buffer that are not + in the delete set, and return the number of characters saved + or 0 upon EOF. */ + +static long +read_and_delete (buf, size, not_used) + unsigned char *buf; + long int size; + PFI not_used; +{ + long n_saved; + static int hit_eof = 0; + + assert (not_used == NULL); + assert (size > 0); + + if (hit_eof) + return 0; + + /* This enclosing do-while loop is to make sure that + we don't return zero (indicating EOF) when we've + just deleted all the characters in a buffer. 
*/ + do + { + int i; + int nr = read (0, (char *) buf, size); + + if (nr < 0) + error (1, errno, "read error"); + if (nr == 0) + { + hit_eof = 1; + return 0; + } + + /* This first loop may be a waste of code, but gives much + better performance when no characters are deleted in + the beginning of a buffer. It just avoids the copying + of buf[i] into buf[n_saved] when it would be a NOP. */ + + for (i = 0; i < nr && !in_delete_set[buf[i]]; i++) + /* empty */ ; + n_saved = i; + + for (++i; i < nr; i++) + if (!in_delete_set[buf[i]]) + buf[n_saved++] = buf[i]; + } + while (n_saved == 0); + + return n_saved; +} + +/* Read at most SIZE bytes from stdin into the array BUF. Then + perform the in-place and one-to-one mapping specified by the global + array `xlate'. Return the number of characters read, or 0 upon EOF. */ + +static long +read_and_xlate (buf, size, not_used) + unsigned char *buf; + long int size; + PFI not_used; +{ + long chars_read = 0; + static int hit_eof = 0; + int i; + + assert (not_used == NULL); + assert (size > 0); + + if (hit_eof) + return 0; + + chars_read = read (0, (char *) buf, size); + if (chars_read < 0) + error (1, errno, "read error"); + if (chars_read == 0) + { + hit_eof = 1; + return 0; + } + + for (i = 0; i < chars_read; i++) + buf[i] = xlate[buf[i]]; + + return chars_read; +} + +/* Initialize a boolean membership set IN_SET with the character + values obtained by traversing the linked list of constructs S + using the function `get_next'. If COMPLEMENT_THIS_SET is + non-zero the resulting set is complemented. 
*/ + +static void +set_initialize (s, complement_this_set, in_set) + struct Spec_list *s; + int complement_this_set; + SET_TYPE *in_set; +{ + int c; + int i; + + bzero (in_set, N_CHARS * sizeof (in_set[0])); + s->state = BEGIN_STATE; + while ((c = get_next (s, NULL)) != -1) + in_set[c] = 1; + if (complement_this_set) + for (i = 0; i < N_CHARS; i++) + in_set[i] = (!in_set[i]); +} + +void +main (argc, argv) + int argc; + char **argv; +{ + int c; + int non_option_args; + struct Spec_list buf1, buf2; + struct Spec_list *s1 = &buf1; + struct Spec_list *s2 = &buf2; + + program_name = argv[0]; + + while ((c = getopt_long (argc, argv, "cdst", long_options, + (int *) 0)) != EOF) + { + switch (c) + { + case 0: + break; + + case 'c': + complement = 1; + break; + + case 'd': + delete = 1; + break; + + case 's': + squeeze_repeats = 1; + break; + + case 't': + truncate_set1 = 1; + break; + + default: + usage (); + break; + } + } + + posix_pedantic = (getenv ("POSIXLY_CORRECT") != 0); + + non_option_args = argc - optind; + translating = (non_option_args == 2 && !delete); + + /* Change this test if it is legal to give tr no options and + no args at all. POSIX doesn't specifically say anything + either way, but it looks like they implied it's illegal + by omission. If you want to make tr do a slow imitation + of `cat' use `tr a a'. */ + if (non_option_args > 2) + usage (); + + if (!delete && !squeeze_repeats && non_option_args != 2) + error (1, 0, "two strings must be given when translating"); + + if (delete && squeeze_repeats && non_option_args != 2) + error (1, 0, "two strings must be given when both \ +deleting and squeezing repeats"); + + /* If --delete is given without --squeeze-repeats, then + only one string argument may be specified. But POSIX + says to ignore any string2 in this case, so if POSIXLY_CORRECT + is set, pretend we never saw string2. But I think + this deserves a fatal error, so that's the default. 
*/ + if ((delete && !squeeze_repeats) && non_option_args != 1) + { + if (posix_pedantic && non_option_args == 2) + --non_option_args; + else + error (1, 0, + "only one string may be given when deleting without squeezing repeats"); + } + + spec_init (s1); + if (parse_str ((unsigned char *) argv[optind], s1)) + exit (1); + + if (non_option_args == 2) + { + spec_init (s2); + if (parse_str ((unsigned char *) argv[optind + 1], s2)) + exit (1); + } + else + s2 = NULL; + + validate (s1, s2); + + if (squeeze_repeats && non_option_args == 1) + { + set_initialize (s1, complement, in_squeeze_set); + squeeze_filter (io_buf, IO_BUF_SIZE, NULL); + } + else if (delete && non_option_args == 1) + { + int nr; + + set_initialize (s1, complement, in_delete_set); + do + { + nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL); + if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0) + error (1, errno, "write error"); + } + while (nr > 0); + } + else if (squeeze_repeats && delete && non_option_args == 2) + { + set_initialize (s1, complement, in_delete_set); + set_initialize (s2, 0, in_squeeze_set); + squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete); + } + else if (translating) + { + if (complement) + { + int i; + SET_TYPE *in_s1 = in_delete_set; + + set_initialize (s1, 0, in_s1); + s2->state = BEGIN_STATE; + for (i = 0; i < N_CHARS; i++) + xlate[i] = i; + for (i = 0; i < N_CHARS; i++) + { + if (!in_s1[i]) + { + int c = get_next (s2, NULL); + assert (c != -1 || truncate_set1); + if (c == -1) + { + /* This will happen when tr is invoked like e.g. + tr -cs A-Za-z0-9 '\012'. 
*/ + break; + } + xlate[i] = c; + } + } + assert (get_next (s2, NULL) == -1 || truncate_set1); + } + else + { + int c1, c2; + int i; + enum Upper_Lower_class class_s1; + enum Upper_Lower_class class_s2; + + for (i = 0; i < N_CHARS; i++) + xlate[i] = i; + s1->state = BEGIN_STATE; + s2->state = BEGIN_STATE; + for (;;) + { + c1 = get_next (s1, &class_s1); + c2 = get_next (s2, &class_s2); + if (!class_ok[(int) class_s1][(int) class_s2]) + error (1, 0, + "misaligned or mismatched upper and/or lower classes"); + /* The following should have been checked by validate... */ + if (c2 == -1) + break; + xlate[c1] = c2; + } + assert (c1 == -1 || truncate_set1); + } + if (squeeze_repeats) + { + set_initialize (s2, 0, in_squeeze_set); + squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate); + } + else + { + int chars_read; + + do + { + chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL); + if (chars_read > 0 + && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0) + error (1, errno, "write error"); + } + while (chars_read > 0); + } + } + + exit (0); +} + diff --git a/src/unexpand.c b/src/unexpand.c new file mode 100644 index 0000000..2733ef7 --- /dev/null +++ b/src/unexpand.c @@ -0,0 +1,432 @@ +/* unexpand - convert spaces to tabs + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + +/* By default, convert only maximal strings of initial blanks and tabs + into tabs. + Preserves backspace characters in the output; they decrement the + column count for tab calculations. + The default action is equivalent to -8. + + Options: + --tabs=tab1[,tab2[,...]] + -t tab1[,tab2[,...]] + -tab1[,tab2[,...]] If only one tab stop is given, set the tabs tab1 + spaces apart instead of the default 8. Otherwise, + set the tabs at columns tab1, tab2, etc. (numbered from + 0); replace any tabs beyond the tabstops given with + single spaces. + --all + -a Use tabs wherever they would replace 2 or more spaces, + not just at the beginnings of lines. + + David MacKenzie */ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include +#include +#include +#include "system.h" + +#ifdef isascii +#define ISDIGIT(c) (isascii((c)) && isdigit((c))) +#else +#define ISDIGIT(c) (isdigit((c))) +#endif + +/* The number of bytes added at a time to the amount of memory + allocated for the output line. */ +#define OUTPUT_BLOCK 256 + +/* The number of bytes added at a time to the amount of memory + allocated for the list of tabstops. */ +#define TABLIST_BLOCK 256 + +char *xmalloc (); +char *xrealloc (); +void error (); + +FILE *next_file (); +void add_tabstop (); +void parse_tabstops (); +void unexpand (); +void usage (); +void validate_tabstops (); + +/* If nonzero, convert blanks even after nonblank characters have been + read on the line. */ +int convert_entire_line; + +/* If nonzero, the size of all tab stops. If zero, use `tab_list' instead. */ +int tab_size; + +/* Array of the explicit column numbers of the tab stops; + after `tab_list' is exhausted, the rest of the line is printed + unchanged. The first column is column 0. */ +int *tab_list; + +/* The index of the first invalid element of `tab_list', + where the next element can be added. */ +int first_free_tab; + +/* Null-terminated array of input filenames. 
*/ +char **file_list; + +/* Default for `file_list' if no files are given on the command line. */ +char *stdin_argv[] = +{ + "-", NULL +}; + +/* Nonzero if we have ever read standard input. */ +int have_read_stdin; + +/* Status to return to the system. */ +int exit_status; + +/* The name this program was run with. */ +char *program_name; +/* Long options; each one maps onto the short option in its last field. */ +struct option longopts[] = +{ + {"tabs", 1, NULL, 't'}, + {"all", 0, NULL, 'a'}, + {NULL, 0, NULL, 0} +}; +/* Parse the options and collect the tab stops, choose tab_size, + convert every input file with unexpand (), then close the standard + streams that were used and exit with exit_status. */ +void +main (argc, argv) + int argc; + char **argv; +{ + int tabval = -1; /* Value of tabstop being read, or -1. */ + int c; /* Option character. */ + + have_read_stdin = 0; + exit_status = 0; + convert_entire_line = 0; + tab_list = NULL; + first_free_tab = 0; + program_name = argv[0]; + + while ((c = getopt_long (argc, argv, "at:,0123456789", longopts, (int *) 0)) + != EOF) + { + switch (c) + { + case '?': + usage (); /* usage () calls exit, so control never reaches the next case. */ + case 'a': + convert_entire_line = 1; + break; + case 't': + convert_entire_line = 1; + parse_tabstops (optarg); + break; + case ',': + add_tabstop (tabval); + tabval = -1; + break; + default: /* A digit of an old-style -tab1[,tab2...] option: append it to tabval. */ + if (tabval == -1) + tabval = 0; + tabval = tabval * 10 + c - '0'; + break; + } + } + /* Record the last old-style tab stop, if one was still being read. */ + add_tabstop (tabval); + + validate_tabstops (tab_list, first_free_tab); + /* No stop given: tabs every 8 columns. One stop: tabs that far apart. + Several stops: tab_size 0 makes unexpand () use tab_list. */ + if (first_free_tab == 0) + tab_size = 8; + else if (first_free_tab == 1) + tab_size = tab_list[0]; + else + tab_size = 0; + + if (optind == argc) + file_list = stdin_argv; + else + file_list = &argv[optind]; + + unexpand (); + + if (have_read_stdin && fclose (stdin) == EOF) + error (1, errno, "-"); + if (fclose (stdout) == EOF) + error (1, errno, "write error"); + exit (exit_status); +} + +/* Add the comma or blank separated list of tabstops STOPS + to the list of tabstops. 
*/ + +void +parse_tabstops (stops) + char *stops; +{ + int tabval = -1; + + for (; *stops; stops++) + { + if (*stops == ',' || isblank (*stops)) + { + add_tabstop (tabval); + tabval = -1; + } + else if (ISDIGIT (*stops)) + { + if (tabval == -1) + tabval = 0; + tabval = tabval * 10 + *stops - '0'; + } + else + error (1, 0, "tab size contains an invalid character"); + } + + add_tabstop (tabval); +} + +/* Add tab stop TABVAL to the end of `tab_list', except + if TABVAL is -1, do nothing. */ + +void +add_tabstop (tabval) + int tabval; +{ + if (tabval == -1) + return; + if (first_free_tab % TABLIST_BLOCK == 0) + tab_list = (int *) xrealloc (tab_list, first_free_tab + TABLIST_BLOCK); + tab_list[first_free_tab++] = tabval; +} + +/* Check that the list of tabstops TABS, with ENTRIES entries, + contains only nonzero, ascending values. */ + +void +validate_tabstops (tabs, entries) + int *tabs; + int entries; +{ + int prev_tab = 0; + int i; + + for (i = 0; i < entries; i++) + { + if (tabs[i] == 0) + error (1, 0, "tab size cannot be 0"); + if (tabs[i] <= prev_tab) + error (1, 0, "tab sizes must be ascending"); + prev_tab = tabs[i]; + } +} + +/* Change spaces to tabs, writing to stdout. + Read each file in `file_list', in order. */ + +void +unexpand () +{ + FILE *fp; /* Input stream. */ + int c; /* Each input character. */ + /* Index in `tab_list' of next tabstop: */ + int tab_index = 0; /* For calculating width of pending tabs. */ + int print_tab_index = 0; /* For printing as many tabs as possible. */ + int column = 0; /* Column on screen of next char. */ + int next_tab_column; /* Column the next tab stop is on. */ + int convert = 1; /* If nonzero, perform translations. */ + int pending = 0; /* Pending columns of blanks. */ + + fp = next_file ((FILE *) NULL); + for (;;) + { + c = getc (fp); + if (c == EOF) + { + fp = next_file (fp); + if (fp == NULL) + break; /* No more files. 
*/ + else + continue; + } + + if (c == ' ' && convert) + { + ++pending; + ++column; + } + else if (c == '\t' && convert) + { + if (tab_size == 0) + { + /* Do not let tab_index == first_free_tab; + stop when it is 1 less. */ + while (tab_index < first_free_tab - 1 + && column >= tab_list[tab_index]) + tab_index++; + next_tab_column = tab_list[tab_index]; + if (tab_index < first_free_tab - 1) + tab_index++; + if (column >= next_tab_column) + { + convert = 0; /* Ran out of tab stops. */ + goto flush_pend; + } + } + else + { + next_tab_column = column + tab_size - column % tab_size; + } + pending += next_tab_column - column; + column = next_tab_column; + } + else + { + flush_pend: + /* Flush pending spaces. Print as many tabs as possible, + then print the rest as spaces. */ + if (pending == 1) + { + putchar (' '); + pending = 0; + } + column -= pending; + while (pending != 0) + { + if (tab_size == 0) + { + /* Do not let tab_index == first_free_tab; + stop when it is 1 less. */ + while (tab_index < first_free_tab - 1 + && column >= tab_list[tab_index]) + print_tab_index++; + next_tab_column = tab_list[print_tab_index]; + if (print_tab_index < first_free_tab - 1) + print_tab_index++; + } + else + { + next_tab_column = column + tab_size - column % tab_size; + } + if (next_tab_column - column <= pending) + { + putchar ('\t'); + pending -= next_tab_column - column; + column = next_tab_column; + } + else + { + --print_tab_index; + column += pending; + while (pending != 0) + { + putchar (' '); + pending--; + } + } + } + + if (convert) + { + if (c == '\b') + { + if (column > 0) + --column; + } + else + { + ++column; + if (convert_entire_line == 0) + convert = 0; + } + } + + putchar (c); + + if (c == '\n') + { + tab_index = print_tab_index = 0; + column = pending = 0; + convert = 1; + } + } + } +} + +/* Close the old stream pointer FP if it is non-NULL, + and return a new one opened to read the next input file. + Open a filename of `-' as the standard input. 
+ Return NULL if there are no more input files. */ + +FILE * +next_file (fp) + FILE *fp; +{ + static char *prev_file; /* Name of the file most recently returned. */ + char *file; + /* Report a read error on the old stream, then release the stream. */ + if (fp) + { + if (ferror (fp)) + { + error (0, errno, "%s", prev_file); + exit_status = 1; + } + if (fp == stdin) + clearerr (fp); /* Also clear EOF. */ + else if (fclose (fp) == EOF) + { + error (0, errno, "%s", prev_file); + exit_status = 1; + } + } + /* Take names from file_list until one can be opened; each failure + is reported and sets exit_status to 1. */ + while ((file = *file_list++) != NULL) + { + if (file[0] == '-' && file[1] == '\0') + { + have_read_stdin = 1; + prev_file = file; + return stdin; + } + fp = fopen (file, "r"); + if (fp) + { + prev_file = file; + return fp; + } + error (0, errno, "%s", file); + exit_status = 1; + } + return NULL; +} +/* Print a usage message on stderr and exit with status 1. */ +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-tab1[,tab2[,...]]] [-t tab1[,tab2[,...]]] [-a]\n\ + [--tabs=tab1[,tab2[,...]]] [--all] [file...]\n", + program_name); + exit (1); +} diff --git a/src/uniq.c b/src/uniq.c new file mode 100644 index 0000000..0968cba --- /dev/null +++ b/src/uniq.c @@ -0,0 +1,321 @@ +/* uniq -- remove duplicate lines from a sorted file + Copyright (C) 1986, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Richard Stallman and David MacKenzie. 
*/ + +#define _GNU_SOURCE +#include +#ifndef isblank +#define isblank(c) ((c) == ' ' || (c) == '\t') +#endif +#include +#include +#include +#include "system.h" +#include "linebuffer.h" + +#define min(x, y) ((x) < (y) ? (x) : (y)) + +char *find_field (); +int different (); +void check_file (); +void error (); +void usage (); +void writeline (); + +/* Number of fields to skip on each line when doing comparisons. */ +int skip_fields; + +/* Number of chars to skip after skipping any fields. */ +int skip_chars; + +/* Number of chars to compare; if 0, compare the whole lines. */ +int check_chars; + +enum countmode +{ + count_occurrences, /* -c Print count before output lines. */ + count_none /* Default. Do not print counts. */ +}; + +/* Whether and how to precede the output lines with a count of the number of + times they occurred in the input. */ +enum countmode countmode; + +enum output_mode +{ + output_repeated, /* -d Only lines that are repeated. */ + output_unique, /* -u Only lines that are not repeated. */ + output_all /* Default. Print first copy of each line. */ +}; + +/* Which lines to output. */ +enum output_mode mode; + +/* The name this program was run with. 
*/ +char *program_name; +/* Long options; each one maps onto the short option in its last field. */ +struct option longopts[] = +{ + {"count", 0, NULL, 'c'}, + {"repeated", 0, NULL, 'd'}, + {"unique", 0, NULL, 'u'}, + {"skip-fields", 1, NULL, 'f'}, + {"skip-chars", 1, NULL, 's'}, + {"check-chars", 1, NULL, 'w'}, + {NULL, 0, NULL, 0} +}; +/* Parse the options, including the old-style -N option and +N operand, + take the optional input and output file operands (each defaults to + a lone dash), and hand them to check_file (). Numeric arguments are + converted with atoi, so malformed numbers are not diagnosed. */ +void +main (argc, argv) + int argc; + char *argv[]; +{ + int optc; + char *infile = "-", *outfile = "-"; + + program_name = argv[0]; + skip_chars = 0; + skip_fields = 0; + check_chars = 0; + mode = output_all; + countmode = count_none; + + while ((optc = getopt_long (argc, argv, "0123456789cdf:s:uw:", longopts, + (int *) 0)) != EOF) + { + switch (optc) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + skip_fields = skip_fields * 10 + optc - '0'; /* Old-style -N: append this digit. */ + break; + + case 'c': + countmode = count_occurrences; + break; + + case 'd': + mode = output_repeated; + break; + + case 'f': /* Like '-#'. */ + skip_fields = atoi (optarg); + break; + + case 's': /* Like '+#'. */ + skip_chars = atoi (optarg); + break; + + case 'u': + mode = output_unique; + break; + + case 'w': + check_chars = atoi (optarg); + break; + + default: + usage (); + } + } + /* Old-style +N operands: each one overwrites skip_chars, so the last wins. */ + while (optind < argc && argv[optind][0] == '+') + skip_chars = atoi (argv[optind++]); + + if (optind < argc) + infile = argv[optind++]; + + if (optind < argc) + outfile = argv[optind++]; + + if (optind < argc) + usage (); /* Extra arguments. */ + + check_file (infile, outfile); + + exit (0); +} + +/* Process input file INFILE with output to OUTFILE. + If either is "-", use the standard I/O stream for it instead. 
*/ + +void +check_file (infile, outfile) + char *infile, *outfile; +{ + FILE *istream; + FILE *ostream; + struct linebuffer lb1, lb2; + struct linebuffer *thisline, *prevline, *exch; + char *prevfield, *thisfield; /* Start of the compared part of each line. */ + int prevlen, thislen; /* Length from that start to the end of the line. */ + int match_count = 0; /* Repeats of PREVLINE seen so far, not counting its first occurrence. */ + + if (!strcmp (infile, "-")) + istream = stdin; + else + istream = fopen (infile, "r"); + if (istream == NULL) + error (1, errno, "%s", infile); + + if (!strcmp (outfile, "-")) + ostream = stdout; + else + ostream = fopen (outfile, "w"); + if (ostream == NULL) + error (1, errno, "%s", outfile); + + thisline = &lb1; + prevline = &lb2; + + initbuffer (thisline); + initbuffer (prevline); + /* Read the first line; if readline returns 0 there is nothing to compare or write. */ + if (readline (prevline, istream) == 0) + goto closefiles; + prevfield = find_field (prevline); + prevlen = prevline->length - (prevfield - prevline->buffer); + + while (!feof (istream)) + { + if (readline (thisline, istream) == 0) + break; + thisfield = find_field (thisline); + thislen = thisline->length - (thisfield - thisline->buffer); + if (!different (thisfield, prevfield, thislen, prevlen)) + match_count++; + else + { + writeline (prevline, ostream, match_count); + match_count = 0; + /* Swap the buffers so that the line just read becomes PREVLINE. */ + exch = prevline; + prevline = thisline; + thisline = exch; + prevfield = thisfield; + prevlen = thislen; + } + } + /* Write the final group of lines. */ + writeline (prevline, ostream, match_count); + + closefiles: + if (ferror (istream) || fclose (istream) == EOF) + error (1, errno, "error reading %s", infile); + + if (ferror (ostream) || fclose (ostream) == EOF) + error (1, errno, "error writing %s", outfile); + + free (lb1.buffer); + free (lb2.buffer); +} + +/* Given a linebuffer LINE, + return a pointer to the beginning of the line's field to be compared. 
*/ + +char * +find_field (line) + struct linebuffer *line; +{ + register int count; + register char *lp = line->buffer; + register int size = line->length; + register int i = 0; + /* Skip skip_fields fields; a field is any leading blanks followed by a run of nonblanks. */ + for (count = 0; count < skip_fields && i < size; count++) + { + while (i < size && isblank (lp[i])) + i++; + while (i < size && !isblank (lp[i])) + i++; + } + /* Then skip up to skip_chars more characters, never past the end of the line. */ + for (count = 0; count < skip_chars && i < size; count++) + i++; + + return lp + i; +} + +/* Return zero if two strings OLD and NEW match, nonzero if not. + OLD and NEW point not to the beginnings of the lines + but rather to the beginnings of the fields to compare. + OLDLEN and NEWLEN are their lengths. + A nonzero result is the memcmp order of the common prefix or, + when that prefix is equal, OLDLEN - NEWLEN after both lengths + have been limited to check_chars. */ + +int +different (old, new, oldlen, newlen) + char *old; + char *new; + int oldlen; + int newlen; +{ + register int order; + /* A nonzero check_chars limits the comparison to that many characters. */ + if (check_chars) + { + if (oldlen > check_chars) + oldlen = check_chars; + if (newlen > check_chars) + newlen = check_chars; + } + order = memcmp (old, new, min (oldlen, newlen)); + if (order == 0) + return oldlen - newlen; /* Equal common prefix: the fields differ only if their lengths do. */ + return order; +} + +/* Output the line in linebuffer LINE to stream STREAM + provided that the switches say it should be output. + If requested, print the number of times it occurred, as well; + LINECOUNT + 1 is the number of times that the line occurred. 
*/ + +void +writeline (line, stream, linecount) + struct linebuffer *line; + FILE *stream; + int linecount; +{ + if ((mode == output_unique && linecount != 0) + || (mode == output_repeated && linecount == 0)) + return; /* Repeated line in unique-only mode, or unrepeated line in repeated-only mode. */ + /* Optional occurrence count in a 7-column field and a tab, then the line and a newline. */ + if (countmode == count_occurrences) + fprintf (stream, "%7d\t", linecount + 1); + + fwrite (line->buffer, sizeof (char), line->length, stream); + putc ('\n', stream); +} +/* Print a usage message on stderr and exit with status 1. */ +void +usage () +{ + fprintf (stderr, "\ +Usage: %s [-cdu] [-f skip-fields] [-s skip-chars] [-w check-chars]\n\ + [-#skip-fields] [+#skip-chars] [--count] [--repeated] [--unique]\n\ + [--skip-fields=skip-fields] [--skip-chars=skip-chars]\n\ + [--check-chars=check-chars] [infile] [outfile]\n", + program_name); + exit (1); +} diff --git a/src/wc.c b/src/wc.c new file mode 100644 index 0000000..72d6ea6 --- /dev/null +++ b/src/wc.c @@ -0,0 +1,231 @@ +/* wc - print the number of bytes, words, and lines in files + Copyright (C) 1985, 1991 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* Written by Paul Rubin, phr@ocf.berkeley.edu + and David MacKenzie, djm@gnu.ai.mit.edu. */ + +#include +#include +#include +#include "system.h" + +/* Size of atomic reads. */ +#define BUFFER_SIZE (16 * 1024) + +void error (); +void wc (); +void wc_file (); +void write_counts (); + +/* Cumulative number of lines, words, and chars in all files so far. 
*/ +unsigned long total_lines, total_words, total_chars; + +/* Which counts to print. */ +int print_lines, print_words, print_chars; + +/* Nonzero if we have ever read the standard input. */ +int have_read_stdin; + +/* The name this program was run with. */ +char *program_name; + +/* The error code to return to the system. */ +int exit_status; +/* Long options; --bytes and --chars both map onto -c. */ +struct option longopts[] = +{ + {"bytes", 0, NULL, 'c'}, + {"chars", 0, NULL, 'c'}, + {"lines", 0, NULL, 'l'}, + {"words", 0, NULL, 'w'}, + {NULL, 0, NULL, 0} +}; +/* Parse the options, count the standard input or each named file, + print a total line when more than one file was named, and exit + with exit_status. */ +void +main (argc, argv) + int argc; + char **argv; +{ + int optc; + int nfiles; + + program_name = argv[0]; + exit_status = 0; + print_lines = print_words = print_chars = 0; + total_lines = total_words = total_chars = 0; + + while ((optc = getopt_long (argc, argv, "clw", longopts, (int *) 0)) != EOF) + switch (optc) + { + case 'c': + print_chars = 1; + break; + + case 'l': + print_lines = 1; + break; + + case 'w': + print_words = 1; + break; + + default: + fprintf (stderr, "\ +Usage: %s [-clw] [--bytes] [--chars] [--lines] [--words] [file...]\n", argv[0]); + exit (1); + } + /* With no selection option, print all three counts. */ + if (print_lines + print_words + print_chars == 0) + print_lines = print_words = print_chars = 1; + + nfiles = argc - optind; + + if (nfiles == 0) + { + have_read_stdin = 1; + wc (0, ""); + } + else + { + for (; optind < argc; ++optind) + wc_file (argv[optind]); + + if (nfiles > 1) + write_counts (total_lines, total_words, total_chars, "total"); + } + + if (have_read_stdin && close (0)) + error (1, errno, "-"); + + exit (exit_status); +} +/* Count FILE, or the standard input if FILE is a lone dash. A failure + to open or close FILE is reported and sets exit_status to 1. */ +void +wc_file (file) + char *file; +{ + if (!strcmp (file, "-")) + { + have_read_stdin = 1; + wc (0, file); + } + else + { + int fd = open (file, O_RDONLY); + if (fd == -1) + { + error (0, errno, "%s", file); + exit_status = 1; + return; + } + wc (fd, file); + if (close (fd)) + { + error (0, errno, "%s", file); + exit_status = 1; + } + } +} +/* Count the lines, words and chars available from descriptor FD, print + them labelled with FILE, and add them to the running totals. A read + error is reported and sets exit_status to 1. */ +void +wc (fd, file) + int fd; + char *file; +{ + char buf[BUFFER_SIZE]; + register int bytes_read; + register 
int in_word = 0; + register unsigned long lines, words, chars; + struct stat stats; + + lines = words = chars = 0; + /* When only the char count is wanted and FD is a regular file, take + the size from fstat instead of reading. NOTE(review): st_size is + the size of the whole file, whatever the current offset of FD is; + confirm that this is intended for a partly read standard input. */ + if (print_chars && !print_words && !print_lines + && fstat (fd, &stats) == 0 && S_ISREG (stats.st_mode)) + { + chars = stats.st_size; + } + else + { + while ((bytes_read = read (fd, buf, BUFFER_SIZE)) > 0) + { + register char *p = buf; + + chars += bytes_read; + do + { + switch (*p++) + { + case '\n': + lines++; + /* Fall through. */ + case '\r': + case '\f': + case '\t': + case '\v': + case ' ': + if (in_word) + { + in_word = 0; + words++; + } + break; + default: + in_word = 1; + break; + } + } + while (--bytes_read); + } + if (bytes_read < 0) + { + error (0, errno, "%s", file); + exit_status = 1; + } + if (in_word) + words++; /* The input ended inside a word. */ + } + + write_counts (lines, words, chars, file); + total_lines += lines; + total_words += words; + total_chars += chars; +} +/* Print those of LC (lines), WC (words) and CC (chars) that are selected, + each in a 7-column field and separated by single spaces, then FILE + if it is not the empty string, then a newline. */ +void +write_counts (lc, wc, cc, file) + unsigned long lc, wc, cc; + char *file; +{ + if (print_lines) + printf ("%7lu", lc); + if (print_words) + { + if (print_lines) + putchar (' '); + printf ("%7lu", wc); + } + if (print_chars) + { + if (print_lines || print_words) + putchar (' '); + printf ("%7lu", cc); + } + if (*file) + printf (" %s", file); + putchar ('\n'); +} -- 2.7.4